说明

lxml最大的优势在于可以用相似的API处理html和xml

lxml底层基于C语言库(libxml2和libxslt),因此需要对bytes、Python字符串、字符编码、CRLF等问题进行函数封装

lxml是目前作者已知的唯一一个可以操作xml名空间的库

基本功能

基本用法

from pathlib import Path
import lxml.etree
import lxml.html

def modify_xml(souce_path: Path, dst_path: Path):
    """Read a GBK-encoded XML file, edit it, and write it out as GB2312.

    The XPath and CSS expressions below are placeholders that show the
    call pattern; replace them with real selectors.

    Args:
        souce_path: XML file to read (parameter name kept as-is for callers).
        dst_path: File that receives the serialised result.

    Raises:
        IndexError: If either selector matches nothing.
        ValueError: If the node selected for removal is the root element.
    """
    # Get the root node. XMLParser lives in lxml.etree; there is no
    # lxml.parser module.
    parser = lxml.etree.XMLParser(encoding="gbk")
    tree = lxml.etree.XML(souce_path.read_bytes(), parser)
    # Locate the target node.
    my_node = tree.xpath("/xpath")[0]
    # Change the node's attribute and text.
    my_node.attrib["name"] = "name"
    my_node.text = "节点的内容"
    # Delete a node. remove() must be called on the node's own parent;
    # calling it on the root only works for direct children of the root.
    useless_node = tree.cssselect("/csspath")[0]
    parent = useless_node.getparent()
    if parent is None:
        raise ValueError("cannot remove the root element")
    parent.remove(useless_node)
    # Write the result. xml_declaration is a boolean switch, not the
    # declaration text; lxml generates the declaration itself.
    dst_path.write_bytes(
        lxml.etree.tostring(
            tree,
            encoding="GB2312",
            xml_declaration=True,
            pretty_print=True,
        )
    )

def get_html_info(souce_path):
    """Parse a GBK-encoded HTML file and print the text of one table cell.

    Args:
        souce_path: Path of the HTML file (converted to str for lxml).

    Raises:
        IndexError: If the XPath expression matches no node.
    """
    parser = lxml.html.HTMLParser(encoding="gbk")
    root = lxml.html.parse(str(souce_path), parser).getroot()
    # Locate the target node. XPath positions are 1-based, so the first cell
    # is td[1]; td[0] never matches anything.
    result_node = root.xpath("/html/body/table[2]/tr[1]/td[1]")[0]
    # Keep the element itself and read .text once here; extracting .text in
    # a comprehension first would leave a plain str with no .text attribute.
    print(result_node.text)

常用函数

from copy import deepcopy
import re
from pathlib import Path
import lxml.etree
import lxml.html


def new_node(
    tag, attributes=None, text="", children=(), copy_children=False, nsmap=None
):
    """Build a new lxml element and return it.

    Args:
        tag: Tag name passed to lxml.etree.Element.
        attributes: Attribute mapping; an empty dict is used when falsy.
        text: Text content; left unset when None or the empty string.
        children: Elements appended to the new node via extend().
        copy_children: When true, a deep copy of ``children`` is appended
            instead of the objects passed in.
        nsmap: Namespace map forwarded to lxml.etree.Element.
    """
    attrib = attributes if attributes else {}
    node = lxml.etree.Element(tag, attrib, nsmap=nsmap)
    if text is not None and text != "":
        node.text = text
    if not children:
        return node
    node.extend(deepcopy(children) if copy_children else children)
    return node


def new_document(text, encoding="utf-8", parser=None):
    """Parse XML supplied as str or bytes and return the root element.

    Args:
        text: Document source. Bytes are decoded with ``encoding``
            (undecodable bytes are replaced); for str input ``encoding``
            is ignored.
        encoding: Encoding of ``text`` when it is bytes.
        parser: Optional lxml parser. When omitted, a parser that forces
            UTF-8 is used (see below).

    Returns:
        The root element produced by lxml.etree.XML.
    """
    if isinstance(text, bytes):
        text = text.decode(encoding, "replace")
    # Drop carriage returns, then hand lxml bytes: it rejects str input that
    # carries an encoding declaration.
    payload = text.replace("\r", "").encode("utf-8")
    if parser is None:
        # The payload is always UTF-8 here, whatever the document's own
        # <?xml ... encoding="..."?> declaration says. Override the declared
        # encoding so that e.g. a GBK-declared document is not mis-decoded.
        parser = lxml.etree.XMLParser(encoding="utf-8")
    # For HTML input use lxml.etree.HTML(payload, parser) instead.
    return lxml.etree.XML(payload, parser)


def purify_text(text, encoding="utf-8"):
    """Normalise line breaks and strip whitespace around them.

    Bytes input is decoded with ``encoding`` (undecodable bytes are
    replaced); for str input ``encoding`` is ignored. Every carriage return
    is removed, then each run of whitespace containing at least one newline
    collapses to a single newline.
    """
    decoded = text.decode(encoding, "replace") if isinstance(text, bytes) else text
    without_cr = decoded.replace("\r", "")
    return re.sub(r"\s*\n\s*", "\n", without_cr)

def write_html(tree, path: Path):
    """Serialise an HTML node as pretty-printed UTF-8 bytes and write it to ``path``."""
    # NOTE(review): method="xml" requests XML-style serialisation of the HTML
    # tree — presumably intentional (XHTML-like output); confirm.
    serialized = lxml.html.tostring(
        tree, encoding="utf-8", pretty_print=True, method="xml"
    )
    path.write_bytes(serialized)


def write_xml(tree, path: Path):
    """Serialise ``tree`` as pretty-printed GB2312 XML and write it to ``path``.

    Args:
        tree: Element (or tree) to serialise.
        path: Destination file; written as bytes.
    """
    # Ask lxml for the XML declaration via xml_declaration=True. The
    # ``doctype`` argument is meant for a DOCTYPE string that is written
    # verbatim ahead of the tree; passing '<?xml version="1.0"?>' there
    # yields a second declaration after the one lxml emits for GB2312.
    path.write_bytes(
        lxml.etree.tostring(
            tree,
            encoding="GB2312",
            xml_declaration=True,
            pretty_print=True,
        )
    )

Clean

用Cleaner清理html中指定类型的节点,默认清除表单和控件等
参考:https://lxml.de/api/lxml.html.clean.Cleaner-class.html

from lxml.html.clean import Cleaner
# Cleaner accepts only keyword options in its constructor; create an instance
# first, then call it on the tree. Calling the instance cleans `root` in
# place (cleaner.clean_html(root) returns a cleaned copy instead).
cleaner = Cleaner()
cleaner(root)

名空间相关

以epub格式的content.opf文件为例

# The prefixes used as keys here (opf, dc, ...) are chosen for convenience
# and need not match the prefixes that appear in the actual XML file.
EPUBNS = {
    "opf": "http://www.idpf.org/2007/opf",
    "dc": "http://purl.org/dc/elements/1.1/",
    "ncx": "http://www.daisy.org/z3986/2005/ncx/",
    "": "http://www.w3.org/1999/xhtml",
}
# Read a skeleton document with new_document() first; it should declare every
# namespace prefix used in this file.
# While the program runs, the real name of a namespaced tag is the
# "{namespace-uri}local-name" form, e.g.
# {http://www.daisy.org/z3986/2005/ncx/}navPoint
tag = "{" + EPUBNS["ncx"] + "}navPoint"
# On output through lxml.etree.tostring(), the namespace is written with the
# prefix declared in the skeleton that was read, e.g. ncx:navPoint.
# NOTE(review): the original note ties this to pretty_print=True; the prefix
# output presumably does not depend on that flag — confirm.

更多详细内容请参考:官网、API说明、官方次要参考。