使用Python3将Markdown(.md)文本转换成html、pdf

使用Python3将Markdown（.md）文本转换成html、pdf 一、Markdown中不同的文本内容会分成不同的文本块，并通过markdown的语法控制进行文本的拼接，组成新的文件。

二、利用Python3实现（.md）文件转换成（.html）文件

在cmd命令行下进入（.py）文件目录下，使用命令进行执行

>python md2html.py < file.md > file.html

import sys, re

#⽣成器模块

def lines(file):
    """Yield every line of *file*, then one extra newline-only line.

    The extra line is a sentinel: a consumer that flushes its buffer when it
    sees a blank line (see ``blocks``) will also flush the last block of a
    file that does not end with a blank line.

    :param file: any iterable of text lines (e.g. an open text file).
    """
    yield from file
    yield '\n'

def blocks(file):
    """Split *file* into text blocks separated by blank lines.

    Consecutive non-blank lines are collected; when a blank (whitespace-only)
    line is reached the collected lines are joined, stripped of leading and
    trailing whitespace, and yielded as one block. Runs of blank lines yield
    nothing.

    :param file: any iterable of text lines.
    """
    block = []
    # lines() appends a final blank line so the last block is always flushed.
    for line in lines(file):
        if line.strip():
            block.append(line)
        elif block:
            yield ''.join(block).strip()
            block = []

#⽂本块处理程序

class Handler:
    """Base class for output handlers.

    Dispatches by name to optional ``start_<name>``, ``end_<name>`` and
    ``sub_<name>`` methods defined on subclasses. A missing method is
    silently ignored, so subclasses implement only what they need.
    """

    def callback(self, prefix, name, *args):
        """Call the method named ``prefix + name`` with *args*, if it exists.

        :returns: the method's return value, or ``None`` when there is no
            callable attribute with that name.
        """
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Invoke ``start_<name>`` if the handler defines it."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke ``end_<name>`` if the handler defines it."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function suitable for ``re.sub``.

        The returned function forwards the match object to ``sub_<name>``.
        When that method is missing (or returns ``None``) the matched text is
        returned unchanged, so an unsupported filter leaves the text intact.
        """
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                result = match.group(0)
            return result

        return substitution

class HTMLRenderer(Handler):
    """Handler that prints HTML markup for each kind of text block.

    Every ``start_*`` / ``end_*`` method prints the opening / closing tag to
    standard output; the ``sub_*`` methods return the HTML replacement for an
    inline match; ``feed`` prints the block content itself.
    """

    def start_document(self):
        print('<html><head><title>Python⽂本解析</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p >')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2 >')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul >')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1 >')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        """Wrap the first captured group in ``<em>``."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Turn the first captured group into a link that opens in a new tab."""
        url = match.group(1)
        return '<a target="_blank" href="%s">%s</a>' % (url, url)

    def sub_mail(self, match):
        """Turn the first captured group into a ``mailto:`` link."""
        address = match.group(1)
        return '<a href="mailto:%s">%s</a>' % (address, address)

    def feed(self, data):
        """Print the (already filtered) block content."""
        print(data)

#规则，判断每个⽂本块应该如何处理

class Rule:
    """Base class for block rules.

    Subclasses provide a ``type`` attribute (the name passed to the
    handler's ``start``/``end``) and a ``condition(block)`` method.
    """

    def action(self, block, handler):
        """Emit *block* wrapped in this rule's start and end markup.

        :param block: the text block to emit.
        :param handler: object exposing ``start(name)``, ``feed(data)`` and
            ``end(name)``.
        :returns: ``True``, which the parser treats as "block handled, stop
            trying further rules".
        """
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True

class HeadingRule(Rule):
    """Rule for headings (emitted through the handler's 'heading' markup).

    A heading is a single-line block of at most 70 characters that does not
    end with a colon.
    """

    type = 'heading'

    def condition(self, block):
        """Return True when *block* looks like a heading."""
        # endswith() rather than block[-1] so an empty block cannot raise.
        return '\n' not in block and len(block) <= 70 and not block.endswith(':')

class TitleRule(HeadingRule):
    """Rule for the document title (emitted through the 'title' markup).

    Only the first block offered to this rule instance can be the title, and
    only if it also satisfies the heading condition.
    """

    type = 'title'

    # True until the first block has been examined.
    first = True

    def condition(self, block):
        """Return True only for the first block, and only if it is a heading."""
        if not self.first:
            return False
        self.first = False
        return super().condition(block)

class ListItemRule(Rule):
    """Rule for list items: blocks that begin with a hyphen."""

    type = 'listitem'

    def condition(self, block):
        """Return True when *block* starts with '-'."""
        # startswith() rather than block[0] so an empty block cannot raise.
        return block.startswith('-')

    def action(self, block, handler):
        """Emit the block as a list item, without its leading hyphen.

        :returns: ``True`` (block handled).
        """
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True

class ListRule(ListItemRule):
    """Rule that opens and closes the list surrounding runs of list items.

    It accepts every block but never consumes one: it only tracks the
    transition between "inside a list" and "outside a list" and emits the
    list start/end markup at those transitions.
    """

    type = 'list'

    # True while the previous block was a list item.
    inside = False

    def condition(self, block):
        """Always applies, so that ``action`` sees every block."""
        return True

    def action(self, block, handler):
        """Emit list start/end markup on list boundaries.

        :returns: ``False`` so the parser keeps trying the remaining rules
            on the same block.
        """
        is_item = ListItemRule.condition(self, block)
        if not self.inside and is_item:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        return False

class ParagraphRule(Rule):
    """Fallback rule: any block is a paragraph."""

    type = 'paragraph'

    def condition(self, block):
        """Always True; intended to be registered after more specific rules."""
        return True

class Code(Rule):
    """Placeholder for code-block and syntax-highlighting rules.

    Nothing is implemented here yet.
    """

    pass

# 对整个⽂本进⾏解析

class Parser:
    """Base parser: applies filters and rules to each block of a file.

    Filters rewrite inline text inside a block; rules decide how a block is
    emitted through the handler.
    """

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Register *rule*; rules are tried in registration order."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register an inline filter.

        :param pattern: regular expression (string or compiled pattern).
        :param name: filter name; matches are replaced through the
            function returned by ``handler.sub(name)``.
        """
        regex = re.compile(pattern)

        def apply_filter(block, handler):
            return regex.sub(handler.sub(name), block)

        self.filters.append(apply_filter)

    def parse(self, file):
        """Parse *file* block by block and drive the handler.

        Each block first passes through every filter, then the rules are
        tried in order; a rule whose ``action`` returns a true value ends
        the search for that block.
        """
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if rule.condition(block):
                    last = rule.action(block, self.handler)
                    if last:
                        break
        self.handler.end('document')

class BasicTextParser(Parser):
    """Plain-text parser preloaded with the standard rules and filters."""

    def __init__(self, handler):
        Parser.__init__(self, handler)
        # Order matters: ListRule never consumes a block, it only opens or
        # closes the surrounding list; ParagraphRule is the catch-all.
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())
        self.addFilter(r'\*(.+?)\*', 'emphasis')
        # The scheme prefix is required: without it the pattern matches every
        # run of letters and each word would be turned into a link.
        self.addFilter(r'(https?://[\.a-zA-Z/]+)', 'url')
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')


# Run the test program.

if __name__ == '__main__':
    # Parse the text arriving on standard input; the renderer prints the
    # resulting HTML. Guarded so that importing this module does not block
    # on stdin.
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)

pdf转html

三、利用Python3将文本转化成pdf文件

命令>python md2pdf.py 源文件 目标文件 [options]

Options:

-h --help show help document.

-v --version show version information.

-o --output translate sourcefile into html file.

-p --print translate sourcefile into pdf file and html file respectively.

-P --Print translate sourcefile into pdf file only.

import os,re

import sys,getopt

from enum import Enum

from subprocess import call

from functools import reduce

from docopt import docopt

# Version string of this script.
# NOTE(review): presumably what the -v/--version option reports; confirm against the option handling.
__version__ = '1.0'

# 定义三个枚举类

# 定义表状态

class TABLE(Enum):
    """States of the table parser."""

    # Not inside a table.
    Init = 1
    # A candidate header row has been seen; the separator row is awaited.
    Format = 2
    # Inside the table body.
    Table = 3

# 有序序列状态

class ORDERLIST(Enum):
    """States of the ordered-list parser."""

    # Not inside an ordered list.
    Init = 1
    # Inside an ordered list.
    List = 2

# 块状态

class BLOCK(Enum):
    """States of the fenced-block parser."""

    # Not inside a fenced block.
    Init = 1
    # Inside a fenced block (plain or language-tagged).
    Block = 2
    # NOTE(review): not referenced by the visible code; confirm whether it is used elsewhere.
    CodeBlock = 3

# 定义全局状态，并初始化状态

# ---- Module-level parser state, each state machine starting idle ----
block_state = BLOCK.Init
orderList_state = ORDERLIST.Init
table_state = TABLE.Init

# ---- Flags ----
# is_code: set by test_state while a language-tagged fenced block is open.
# is_normal: starts True; test_state's global statement does not list it.
is_code, is_normal = False, True
# NOTE(review): need_mathjax is not referenced by the visible code; confirm where it is used.
need_mathjax = False

# ---- First row of a candidate table, buffered until the separator row ----
temp_table_first_line_str = ""
temp_table_first_line = []

# A fence line without a language tag: three backticks, optional whitespace,
# then a newline.
_FENCE_RE = re.compile(r'```(\s)*\n')
# Whitespace characters (other than tab) found inside a fenced block.
_BLOCK_SPACE_RE = re.compile(r'[\n\r\v\f\ ]')
# A line that contains at least one '|' with text on both sides of it.
_TABLE_ROW_RE = re.compile(r'^((.+)\|)+((.+))$')


def test_state(input):
    """Advance the fenced-block, ordered-list and table state machines.

    The line is examined by the three state machines in that order; each
    may replace the result produced so far. The module-level state
    (``block_state``, ``is_code``, ``orderList_state``, ``table_state`` and
    the buffered first table row) is updated as a side effect.

    :param input: one line of Markdown text (the name shadows the builtin
        but is part of the existing signature).
    :returns: the HTML fragment for this line, the line unchanged when no
        state machine reacted, or ``""`` while a candidate table header row
        is being buffered.

    NOTE(review): the original also assigned ``is_normal = False`` in several
    branches, but ``is_normal`` was not in the ``global`` statement, so those
    assignments only created an unused local. They are omitted here to keep
    the observable behaviour; confirm against the callers whether updating
    the module-level ``is_normal`` was intended.
    """
    global table_state, orderList_state, block_state, is_code, temp_table_first_line, temp_table_first_line_str

    result = input

    # ---- Fenced blocks -------------------------------------------------
    if _FENCE_RE.match(input) and block_state == BLOCK.Init:
        # Plain fence: rendered as a block quote.
        result = "<blockquote>"
        block_state = BLOCK.Block
    elif (len(input) > 4 and input[0:3] == '```'
          and (input[3:9] == "python" or input[3:6] == "c++" or input[3:4] == "c")
          and block_state == BLOCK.Init):
        # Language-tagged fence: rendered as code.
        block_state = BLOCK.Block
        result = "<code></br>"
        is_code = True
    elif block_state == BLOCK.Block and input == '```\n':
        # Closing fence.
        result = "</code>" if is_code else "</blockquote>"
        block_state = BLOCK.Init
        is_code = False
    elif block_state == BLOCK.Block:
        # Content line inside a fence. HTML collapses ordinary whitespace,
        # so non-breaking-space entities are emitted to keep the indentation
        # visible (replacing a space with a plain space would be a no-op).
        result = _BLOCK_SPACE_RE.sub("&nbsp;", result)
        result = result.replace('\t', "&nbsp;" * 4)
        result = "<span>" + result + "</span></br>"

    # ---- Ordered lists ("1.item") ---------------------------------------
    is_list_item = len(input) > 2 and input[0].isdigit() and input[1] == '.'
    if is_list_item and orderList_state == ORDERLIST.Init:
        orderList_state = ORDERLIST.List
        result = "<ol><li>" + input[2:] + "</li>"
    elif is_list_item and orderList_state == ORDERLIST.List:
        result = "<li>" + input[2:] + "</li>"
    elif orderList_state == ORDERLIST.List and not is_list_item:
        # First non-item line after a list closes it.
        result = "</ol>" + input
        orderList_state = ORDERLIST.Init

    # ---- Tables ----------------------------------------------------------
    if _TABLE_ROW_RE.match(input):
        cells = input.split('|')
        # Drop the line terminator from the last cell, but never a real
        # character when the line has no trailing newline.
        if cells[-1].endswith('\n'):
            cells[-1] = cells[-1][:-1]
        # Leading/trailing '|' produce empty edge cells; discard them.
        if cells[0] == '':
            cells.pop(0)
        if cells[-1] == '':
            cells.pop(-1)

        if table_state == TABLE.Init:
            # Possible header row: buffer it until the next line tells us
            # whether a table really starts here.
            table_state = TABLE.Format
            temp_table_first_line = cells
            temp_table_first_line_str = input
            result = ""
        elif table_state == TABLE.Format:
            if all(all_same(cell, '-') for cell in cells):
                # Separator row confirmed: emit the buffered header.
                table_state = TABLE.Table
                header = "".join("<th>" + cell + "</th>" for cell in temp_table_first_line)
                result = "<table><thead><tr>" + header + "</tr></thead><tbody>"
            else:
                # Not a table after all: emit both lines as ordinary text.
                result = temp_table_first_line_str + "</br>" + input
                table_state = TABLE.Init
        elif table_state == TABLE.Table:
            result = "<tr>" + "".join("<td>" + cell + "</td>" for cell in cells) + "</tr>"
    elif table_state == TABLE.Table:
        # First non-row line after a table closes it.
        table_state = TABLE.Init
        result = "</tbody></table>" + result
    elif table_state == TABLE.Format:
        # The buffered line was not followed by a table row. Emit it now
        # instead of dropping it, and leave the pending-header state.
        result = temp_table_first_line_str + "</br>" + result
        table_state = TABLE.Init

    return result

#　判断 lst 是否全由字符 sym 构成　

def all_same(lst, sym):
    """Return True when *lst* is empty or equals *sym* repeated ``len(lst)`` times.

    With a string *lst* and a one-character *sym* this means "every character
    of lst is sym".
    """
    return not lst or sym * len(lst) == lst

# 处理标题

def handleTitle(s, n):
    """Wrap a heading line in an ``<hN>`` element.

    :param s: the heading line; its first *n* characters are dropped.
    :param n: heading level, used both as the tag number and as the number
        of leading characters to drop.
    :returns: ``<hN>`` + ``s[n:]`` + ``</hN>``; the remainder of the line is
        kept as is, including any leading space or trailing newline.
    """
    level = repr(n)
    return "<h" + level + ">" + s[n:] + "</h" + level + ">"

使用Python3将Markdown(.md)文本转换成html、pdf

发布评论取消回复

最近发表

热门文章

标签列表