|
1.介绍PyMuPDF和Fitz是用于Python中处理PDF文件的相关模块。Fitz是PyMuPDF的子模块,提供一个简化和封装版本的PyMuPDF功能。关系:PyMuPDF:提供广泛的功能,用于操作PDF文档,包括方便的高级函数与底层操作Fitz:简化和封装了PyMuPDF的功能,使在python中处理PDF文件更加简单2.基本操作获取PDF的文档基本信息#-*-coding:utf-8-*-#PyMuPDF==1.23.26importfitzpdf_path='rag_datas/text.pdf'doc=fitz.open(pdf_path)#文件加载#basicPDFinfotitle=doc.metadata['title']author=doc.metadata['author']#文档作者create_data=doc.metadata['creationDate']#文档创建时间num_pages=doc.page_count#文档页数page=doc.load_page(0)#第一页page_height=page.bound().heightpage_width=page.bound().width1234567891011121314151617获取pdf文档中的文本#-*-coding:utf-8-*-#PyMuPDF==1.23.26importfitzpdf_path='rag_datas/text.pdf'doc=fitz.open(pdf_path)#文件加载num_pages=doc.page_count#文档页数#Textinfoofpdfforpage_indexinrange(num_pages): page=doc.load_page(page_index)#获取页面内容 text=page.get_text()#获取页面文本 print(f"第{page_index+1}页的文本内容为:\n{text}\n")1234567891011121314获取pdf文档中的图片#-*-coding:utf-8-*-#PyMuPDF==1.23.26importfitzpdf_path='rag_datas/text.pdf'doc=fitz.open(pdf_path)#文件加载num_pages=doc.page_count#文档页数#Imageinfoofpdfforpage_indexinrange(num_pages): page=doc.load_page(page_index)#获取页面内容 image_list=page.get_images()#获取页面图片 print(image_list)#图片基本信息 forimginimage_list: xref=img[0] pix=fitz.Pixmap(doc,xref) print(pix.colorspace,'-->',fitz.csRGB) img_pathf'../output/image{page_index+1}_{xref}.png' pix.save(img_path)1234567891011121314151617181920获取pdf文档中的表格#-*-coding:utf-8-*-#PyMuPDF==1.23.26importfitzpdf_path='rag_datas/text.pdf'doc=fitz.open(pdf_path)#文件加载num_pages=doc.page_count#文档页数#tablesinfoofpdfforpage_indexinrange(num_pages): page=doc.load_page(page_index)#获取页面内容 tables=page.find_tables()#获取页面表格 print(f"tables:"{tables}) #提取的表格数据将会保存为csv格式文件 fori,tableinenumerate(tables): df=tables[0].to_pandas() print(df.head()) df.to_csv(f"../output/table_pd_{page_index}_{i+1}.csv",index=False) 
12345678910111213141516171819获取pdf文档分割#-*-coding:utf-8-*-#PyMuPDF==1.23.26importfitzpdf_path='rag_datas/text.pdf'doc=fitz.open(pdf_path)#文件加载num_pages=doc.page_count#文档页数#构建输出文件名,以页数命名#foriinrange(1,num_pages): print(f"i"{i}") #创建一个新的Document对象,包含当前页面 new_pdf=fitz.open() new_pdf.insert_pdf(pdf_document.from_page=i-1,to_page=i) #保存单独的PDF文件 new_pdf.save(output_pdf.format(i)) new_pdf.close()pdf_document.close()12345678910111213141516171819借助大模型进行文档问答#-*-coding:utf-8-*-#PyMuPDF==1.23.26importosimportfitzfromopenaiimportOpenAIdefget_pdf_content(pdf_path:str)->str: doc=fitz.open(pdf_path) num_pages=doc.page_count bg_content_list=[] #FullTextofPDF forpage_indexinrange(num_pages): page=doc.load_page(page_index) text=page.get_text() bg_content_list.append(text) return''.join(bg_content_list)defget_answer(pdf_content:str,query:str)->str: client=OpenAI(api_key=os.getenv("OPENAI_API_KEY")) response=clinet.chat.completions.create( model="gpt-3.5-turbo", messages=[ {"role":"system","content":"Youareahelpfulassistant."}, {"role":"user","content":"f"ThefulltextofPDFfileis:{pdf_content}"}, {"role":"user","content":query} ], max_tokens=1000 ) answer=response.choices[0].message.content returnanswerif__name__=="__main__":content=get_pdf_content("rag_datas/text.pdf")query_1='蚂蚁集团发布的大模型叫什么?'print(get_answer(pdf_content=content,query=query_1)) query_2='混元大模型是什么时候发布的?'print(get_answer(pdf_content=content,query=query_2))12345678910111213141516171819202122232425262728293031323334353637383940参考:版面分析–PDF解析神器PyMuPDFgithub:https://github.com/pymupdf/PyMuPDF官方文档:https://pymupdf.readthedocs.io/en/latest/tutorial.html
|
|