https://pypdf.readthedocs.io/en/stable/
创建Annotation的时候,很多样式控制不生效
尝试获取PDF Annotation的信息(但是获取的信息不正确,以下是部分代码):
def _annot_entry(obj, key):
    """Return ``obj[key]``, or ``None`` when the annotation has no such entry."""
    # Index with [] instead of .get() so the value is looked up exactly the
    # way the original ``obj[key]`` lookups did (presumably this is what lets
    # pypdf resolve indirect references -- confirm against the pypdf docs).
    return obj[key] if key in obj else None


def save_annotations(filepath: str, filename: str):
    """Save the annotations of a PDF to a pickle file next to it.

    The PDF at ``filepath/filename`` (``~`` is expanded) is read with
    ``PdfReader``.  For every page that has an ``/Annots`` entry, a list of
    dicts is built, one per annotation, holding ``subtype`` and ``rect`` plus:

    * ``da`` and ``contents`` for ``/FreeText`` annotations,
    * ``quadpoints`` for ``/Highlight`` annotations.

    Entries an annotation does not carry are stored as ``None`` instead of
    raising ``KeyError``.  The result, a dict keyed by ``page.page_number``,
    is pickled to the same path with the ``pdf`` suffix replaced by ``pkl``.

    Returns ``None``; does nothing when the path is not an existing file
    whose name ends with ``.pdf``.

    Observations recorded by the original author:

    * the annotation object holds no information about the text style;
    * the ``/DA`` value looks wrong, e.g. ``//Helvetica 12 Tf 0 g``.
    """
    filepath = os.path.expanduser(filepath)
    file_fullpath = os.path.join(filepath, filename)
    if not os.path.isfile(file_fullpath) or not file_fullpath.endswith(".pdf"):
        return

    reader = PdfReader(file_fullpath)
    data = {}
    for page in reader.pages:
        # Pages without annotations get no entry in the result.
        if "/Annots" not in page:
            continue
        pagelist = []
        for annot in page["/Annots"]:
            obj = annot.get_object()
            subtype = _annot_entry(obj, "/Subtype")
            annotation = {"subtype": subtype, "rect": _annot_entry(obj, "/Rect")}
            if subtype == "/FreeText":
                annotation["da"] = _annot_entry(obj, "/DA")
                annotation["contents"] = _annot_entry(obj, "/Contents")
            elif subtype == "/Highlight":
                annotation["quadpoints"] = _annot_entry(obj, "/QuadPoints")
            pagelist.append(annotation)
        data[page.page_number] = pagelist

    # "name.pdf" -> "name.pkl": drop the three-letter suffix, keep the dot.
    with open(file_fullpath[:-3] + "pkl", "wb") as file:
        pickle.dump(data, file)
https://pymupdf.readthedocs.io/en/latest/
这个包(PyMuPDF)的导入名是 fitz,即代码中写 import fitz;较新的版本也支持 import pymupdf
使用PyMuPDF获取注释信息(代码来自AI,获取的信息不正确):
import fitz  # PyMuPDF


def extract_freetext_annotations(pdf_path: str, output_file: str) -> None:
    """Write the FreeText annotations of a PDF to a text file.

    Every page of ``pdf_path`` is scanned for FreeText annotations.  For each
    one, several pieces of diagnostic information are printed (raw object
    source, colors, appearance-stream fonts, the ``/DA`` entry), and its
    text, rectangle and stroke color are collected.  After the document is
    closed, the collected records are written to ``output_file`` as UTF-8.

    Args:
        pdf_path: Path of the PDF to read.
        output_file: Path of the text report to create or overwrite.
    """
    results = []
    # The context manager closes the document even if an annotation fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # ``types`` is passed as a sequence of annotation type codes.
            for annot in page.annots(types=[fitz.PDF_ANNOT_FREE_TEXT]):
                if annot is None:
                    continue
                xref = annot.xref  # cross-reference number of the annotation
                # Source of the annotation's PDF object, looked up by xref.
                annot_dict = doc.xref_object(xref, compressed=False)
                # Print it to inspect the details of the annotation object.
                print(annot_dict)
                # Author's note: the output is a very long text that includes
                #   /BS << /W 5 >> /DA (//Helvetica 12 Tf 0 g)
                #   /Border [ 0 0 5 ] /F 4 /Type /Annot
                #   /M (D:20240419110615Z00'00')
                #   /Rect [ 83.86732 245.2609 456.2475 636.324 ]
                #   /Subtype /FreeText /IC [ 1 .3837403 .3160096 ]
                #   /AP << /N 9046 0 R >>
                # i.e. the expected text style information is not in there.
                print(annot.colors.items())
                # Author's note: prints two colors, but they are the
                # annotation's background fill and border color, unrelated
                # to the font color, e.g.
                # dict_items([('stroke', [0.3248277008533478,
                #   0.8369094729423523, 0.9915711283683777]),
                #   ('fill', [1.0, 0.38374030590057373, 0.31600961089134216])])
                print(doc.xref_get_key(annot.xref, "AP/N/Resources/Font"))
                # e.g. ('dict', '<</C1 9050 0 R/TT1 9056 0 R>>')
                print(doc.xref_get_key(annot.xref, "DA"))
                # Author's note: still not the expected information, e.g.
                # ('string', '//Helvetica 12 Tf 0 g')
                info = {
                    "text": annot.info["content"],  # text of the annotation
                    "position": str(annot.rect),  # rectangle of the annotation
                    # Falls back to "Not specified" when no stroke color is set.
                    "color": str(annot.colors.get("stroke", "Not specified")),
                }
                results.append(info)

    # Explicit UTF-8 so non-ASCII annotation text does not depend on the
    # platform's default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in results:
            f.write(f"Text: {item['text']}\n")
            f.write(f"Position: {item['position']}\n")
            f.write(f"Color: {item['color']}\n")
            f.write("-" * 40 + "\n")


# Usage example; guarded so importing this module does not touch any files.
if __name__ == "__main__":
    pdf_path = 'input.pdf'  # path of the input PDF
    output_file = 'annotations.txt'  # path of the report to write
    extract_freetext_annotations(pdf_path, output_file)