说明
lxml最大的优势在于可以用相似的API处理html和xml
lxml底层基于C语言库libxml2和libxslt,因此需要对bytes、Python字符串、字符编码、CRLF等问题进行函数封装
lxml是目前作者已知的唯一一个可以操作xml名空间的库
基本功能
基本用法
from pathlib import Path
import lxml.etree
import lxml.html
def modify_xml(souce_path: Path, dst_path: Path):
    """Load a GBK-encoded XML file, edit it, and save it as GB2312.

    Args:
        souce_path: XML file to read (parameter name kept as-is so existing
            keyword callers keep working). Its bytes are parsed as GBK.
        dst_path: File that receives the serialised result.

    Raises:
        IndexError: If the XPath or CSS query matches nothing.
    """
    # Parse the bytes into the root element; XMLParser lives in lxml.etree.
    parser = lxml.etree.XMLParser(encoding="gbk")
    tree = lxml.etree.XML(souce_path.read_bytes(), parser)
    # Locate the target node ("/xpath" is the sample expression of these notes).
    my_node = tree.xpath("/xpath")[0]
    # Change an attribute and the text content of the node.
    my_node.attrib["name"] = "name"
    my_node.text = "节点的内容"
    # Delete a node. remove() only accepts a direct child, so it has to be
    # called on the node's own parent rather than on the root.
    useless_node = tree.cssselect("/csspath")[0]
    useless_node.getparent().remove(useless_node)
    # Serialise; xml_declaration is a flag, the declaration text itself is
    # generated by lxml from the chosen encoding.
    dst_path.write_bytes(
        lxml.etree.tostring(
            tree,
            encoding="GB2312",
            xml_declaration=True,
            pretty_print=True,
        )
    )
def get_html_info(souce_path):
    """Print the text of the first cell in row 1 of the second table.

    Args:
        souce_path: Path of the HTML file; it is parsed as GBK.

    Raises:
        IndexError: If the document has no such cell.
    """
    parser = lxml.html.HTMLParser(encoding="gbk")
    root = lxml.html.parse(str(souce_path), parser).getroot()
    # XPath positions are 1-based: td[1] is the first cell, td[0] never matches.
    cells = root.xpath("/html/body/table[2]/tr[1]/td[1]")
    # Take .text exactly once, on the element itself.
    print(cells[0].text)
常用函数
from copy import deepcopy
import re
from pathlib import Path
import lxml.etree
import lxml.html
def new_node(
    tag, attributes=None, text="", children=(), copy_children=False, nsmap=None
):
    """Build a new element and optionally fill in its text and children.

    Args:
        tag: Tag name passed to ``lxml.etree.Element``.
        attributes: Attribute mapping; a falsy value becomes an empty dict.
        text: Text content; ``None`` and ``""`` leave ``.text`` unset.
        children: Nodes to append; skipped when falsy.
        copy_children: When true, deep copies of ``children`` are appended
            instead of the objects themselves.
        nsmap: Namespace map forwarded to ``lxml.etree.Element``.

    Returns:
        The newly created element.
    """
    attrib = attributes or {}
    node = lxml.etree.Element(tag, attrib, nsmap=nsmap)
    if text is not None and text != "":
        node.text = text
    if not children:
        return node
    node.extend(deepcopy(children) if copy_children else children)
    return node
def new_document(text, encoding="utf-8", parser=None):
    """Parse XML given as ``str`` or ``bytes`` and return the root element.

    Args:
        text: The document. ``bytes`` are decoded with ``encoding`` (invalid
            sequences are replaced); for ``str`` input ``encoding`` is ignored.
        encoding: Encoding of ``text`` when it is ``bytes``.
        parser: Optional lxml parser, used as-is. When omitted, a parser with
            a UTF-8 override is created.

    Returns:
        The element produced by ``lxml.etree.XML``.
    """
    if isinstance(text, bytes):
        text = text.decode(encoding, "replace")
    # Strip carriage returns, then hand lxml bytes rather than str.
    data = text.replace("\r", "").encode("utf-8")
    if parser is None:
        # The payload is always UTF-8 here, whatever the document's own
        # <?xml ... encoding="..."?> declaration says, so override it.
        parser = lxml.etree.XMLParser(encoding="utf-8")
    # For HTML input, return lxml.etree.HTML(data, <an HTML parser>) instead.
    return lxml.etree.XML(data, parser)
def purify_text(text, encoding="utf-8"):
    """Return ``text`` with normalised line breaks.

    ``bytes`` input is decoded with ``encoding`` (invalid sequences are
    replaced); for ``str`` input ``encoding`` is ignored. Carriage returns are
    dropped, then every newline together with the whitespace around it is
    collapsed into a single newline.
    """
    decoded = text.decode(encoding, "replace") if isinstance(text, bytes) else text
    without_cr = decoded.replace("\r", "")
    return re.sub(r"\s*\n\s*", "\n", without_cr)
def write_html(tree, path: Path):
    """Write an HTML node to ``path`` as pretty-printed UTF-8 bytes.

    Serialisation goes through ``lxml.html.tostring`` with ``method="xml"``.
    """
    serialized = lxml.html.tostring(
        tree, method="xml", pretty_print=True, encoding="utf-8"
    )
    path.write_bytes(serialized)
def write_xml(tree, path: Path):
    """Write an XML node to ``path`` as pretty-printed GB2312 bytes.

    Args:
        tree: Element (or tree) to serialise.
        path: Destination file.
    """
    path.write_bytes(
        lxml.etree.tostring(
            tree,
            encoding="GB2312",
            # Request the <?xml ...?> header through the dedicated flag.
            # `doctype` is meant for a <!DOCTYPE ...> string; an XML
            # declaration passed there ends up as a second declaration.
            xml_declaration=True,
            pretty_print=True,
        )
    )
Clean
用Cleaner清理html中指定类型的节点,默认清除表单和控件等
参考:https://lxml.de/api/lxml.html.clean.Cleaner-class.html
from lxml.html.clean import Cleaner
# Cleaner() only accepts keyword options; the tree is passed to the instance.
# Calling the instance cleans `root` itself; use cleaner.clean_html(root) to
# get a cleaned copy instead.
cleaner = Cleaner()
cleaner(root)
名空间相关
以epub格式的content.opf文件为例
# The prefixes used as keys below ("opf" etc.) are picked for convenience and
# do not have to match the prefixes written in the XML file itself.
EPUBNS = {
    "opf": "http://www.idpf.org/2007/opf",
    "dc": "http://purl.org/dc/elements/1.1/",
    "ncx": "http://www.daisy.org/z3986/2005/ncx/",
    "": "http://www.w3.org/1999/xhtml",
}
# First load, with new_document, a skeleton document that declares every
# namespace prefix this file uses.
# While being processed, a tag carries its fully qualified name, i.e.
# {http://www.daisy.org/z3986/2005/ncx/}navPoint
tag = f"{{{EPUBNS['ncx']}}}navPoint"
# Per the original note: with pretty_print=True, lxml.etree.tostring() writes
# the namespaces in the prefixed form used by the skeleton, e.g. ncx:navPoint.
# NOTE(review): confirm whether pretty_print is actually required for this.