Table of Contents

Python操作PDF

关联项目:https://github.com/GZhonghui/AnnotationSync

目前还没有找到特别完美的处理PDF的第三方库

PDF格式相关的内容:PDF


pypdf(有bug)

https://pypdf.readthedocs.io/en/stable/

创建Annotation的时候,很多样式控制不生效

尝试获取PDF Annotation的信息(但是获取的信息不正确,以下是部分代码):

# Save the annotations of a PDF to a local pickle file.
def save_annotations(filepath: str, filename: str) -> None:
    """Dump the annotations of the PDF ``filepath/filename`` to a ``.pkl`` file.

    The output is written next to the PDF, using the same base name with a
    ``.pkl`` extension. It contains a dict that maps ``page.page_number`` to a
    list of annotation dicts. Each annotation dict holds ``subtype`` and
    ``rect``; ``/FreeText`` annotations add ``da`` and ``contents``, and
    ``/Highlight`` annotations add ``quadpoints``. An entry that the
    annotation does not carry is stored as ``None``.

    Pages without an ``/Annots`` entry are left out of the dict.

    Args:
        filepath: Directory that contains the PDF; ``~`` is expanded.
        filename: Name of the PDF file inside ``filepath``.

    Returns:
        None. Nothing is written when the path is not an existing file or
        does not end in ``.pdf`` (compared case-insensitively).
    """
    file_fullpath = os.path.join(os.path.expanduser(filepath), filename)
    if not os.path.isfile(file_fullpath):
        return
    if not file_fullpath.lower().endswith(".pdf"):
        return

    reader = PdfReader(file_fullpath)

    # Per-subtype optional entries: (PDF key, name used in the output dict).
    optional_entries = {
        "/FreeText": (("/DA", "da"), ("/Contents", "contents")),
        "/Highlight": (("/QuadPoints", "quadpoints"),),
    }

    data = {}
    for page in reader.pages:
        if "/Annots" not in page:
            continue

        pagelist = []
        for annot in page["/Annots"]:
            # NOTE: no font-style information for the annotation was found in
            # the data of this object.
            obj = annot.get_object()
            subtype = obj["/Subtype"]
            annotation = {"subtype": subtype, "rect": obj["/Rect"]}

            # NOTE: the /DA value observed here did not describe the real text
            # style; it looked like: //Helvetica 12 Tf 0 g
            for key, field in optional_entries.get(subtype, ()):
                # Membership test + indexing instead of a bare obj[key], so a
                # single annotation without the entry cannot abort the dump.
                annotation[field] = obj[key] if key in obj else None

            pagelist.append(annotation)
        data[page.page_number] = pagelist

    # Values are stored as returned by the reader (presumably pypdf object
    # types, so loading the pickle likely needs pypdf installed; confirm).
    with open(file_fullpath[:-4] + ".pkl", "wb") as file:
        pickle.dump(data, file)

PyMuPDF(有bug)

https://pymupdf.readthedocs.io/en/latest/

PyMuPDF在代码中的导入名是fitz(即 import fitz),较新的版本也可以写成 import pymupdf

使用PyMuPDF获取注释信息(代码来自AI,获取的信息不正确):

import fitz  # 导入PyMuPDF库
 
def extract_freetext_annotations(pdf_path, output_file):
    """Write the FreeText annotations of a PDF to a plain-text report.

    For every FreeText annotation the raw object source and a few style
    related lookups are printed (debugging aid), and its text, rectangle and
    stroke color are appended to ``output_file``.

    Args:
        pdf_path: Path of the PDF to read.
        output_file: Path of the UTF-8 text file to write.
    """
    results = []

    # The context manager closes the document even if a lookup below raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # `types` has to be a sequence of annotation type codes; a bare
            # integer is not treated as a filter.
            for annot in page.annots(types=[fitz.PDF_ANNOT_FREE_TEXT]):
                if annot is None:
                    continue

                # Look up the annotation's object source through its xref.
                xref = annot.xref
                annot_dict = doc.xref_object(xref, compressed=False)
                print(annot_dict)
                # Sample output (a very long text), which includes:
                #   /BS <<
                #       /W 5
                #   >>
                #   /DA (//Helvetica 12 Tf 0 g)
                #   /Border [ 0 0 5 ]
                #   /F 4
                #   /Type /Annot
                #   /M (D:20240419110615Z00'00')
                #   /Rect [ 83.86732 245.2609 456.2475 636.324 ]
                #   /Subtype /FreeText
                #   /IC [ 1 .3837403 .3160096 ]
                #   /AP <<
                #       /N 9046 0 R
                #   >>
                # NOTE: the correct text style could not be obtained from it.

                print(annot.colors.items())
                # Two colors are printed, but they are the annotation's
                # background fill and border color, unrelated to the font color:
                # dict_items([('stroke', [0.3248277008533478, 0.8369094729423523, 0.9915711283683777]), ('fill', [1.0, 0.38374030590057373, 0.31600961089134216])])

                print(doc.xref_get_key(annot.xref, "AP/N/Resources/Font"))
                # ('dict', '<</C1 9050 0 R/TT1 9056 0 R>>')

                print(doc.xref_get_key(annot.xref, "DA"))
                # Still not the expected information:
                # ('string', '//Helvetica 12 Tf 0 g')

                results.append({
                    "text": annot.info["content"],  # annotation text
                    "position": str(annot.rect),  # annotation rectangle
                    # Falls back to "Not specified" when no stroke color is set.
                    "color": str(annot.colors.get("stroke", "Not specified")),
                })

    # Explicit UTF-8 so non-ASCII annotation text is written correctly
    # regardless of the platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in results:
            f.write(f"Text: {item['text']}\n")
            f.write(f"Position: {item['position']}\n")
            f.write(f"Color: {item['color']}\n")
            f.write("-" * 40 + "\n")
 
# Example usage: the input PDF path and the text file that receives the report.
pdf_path, output_file = 'input.pdf', 'annotations.txt'
extract_freetext_annotations(pdf_path=pdf_path, output_file=output_file)