pdf文件版面分析--pdfplumber（python文档解析提取）

见贤思齐 · 发表于 2024-9-4 09:28:47

pdfplumber的特点

1、它是一个纯python第三方库，适合python3.x版本
2、它用来查看pdf各类信息，能有效提取文本、表格
3、它不支持修改或生成pdf，也不支持对pdf扫描件的处理

importglobimportpdfplumberimportrefromcollectionsimportdefaultdictimportjsonclassPDFProcessor:def__init__(self,filepath):self.filepath=filepath#打开文档，注意存放的位置self.pdf=pdfplumber.open(filepath)self.all_text=defaultdict(dict)self.allrow=0self.last_num=0defcheck_lines(self,page,top,buttom): """用于检查页面中的行，并根据给定的顶部和底部位置来合并行。""" #文本数据lines=page.extract_words()[::]text=''last_top=0last_check=0forlinrange(len(lines)):each_line=lines[l]check_re='(?:。|；|单位：元|单位：万元|币种：人民币|\d|报告(?:全文)?(?:（修订版）|（修订稿）|（更正后）)?)iftop==''andbuttom=='':ifabs(last_top-each_line['top'])0and(page.height*0.85-each_line['top'])>0andnotre.search(check_re,text):eliflast_check>0and(page.height*0.9-each_line['top'])>0andnotre.search(check_re,text):text=text+each_line['text']else:text=text+'\n'+each_line['text']eliftop=='':ifeach_line['top']>buttom:ifabs(last_top-each_line['top'])0and(page.height*0.85-each_line['top'])>0andnotre.search(check_re,text):text=text+each_line['text']else:text=text+'\n'+each_line['text']else:ifeach_line['top']buttom:ifabs(last_top-each_line['top'])0and(page.height*0.85-each_line['top'])>0andnotre.search(check_re,text):text=text+each_line['text']else:text=text+'\n'+each_line['text']last_top=each_line['top']last_check=each_line['x1']-page.width*0.85returntextdefdrop_empty_cols(self,data):#删除所有列为空数据的列transposed_data=list(map(list,zip(*data)))#转置数据filtered_data=[colforcolintransposed_dataifnotall(cellis''forcellincol)]#过滤掉空列result=list(map(list,zip(*filtered_data)))#再次转置数据returnresult@staticmethoddefkeep_visible_lines(obj):"""保留可见的线条。Iftheobjectisa``rect``type,keepitonlyifthelinesarevisible.Avisiblelineistheonehaving``non_stroking_color``notnull."""ifobj['object_type']=='rect':ifobj['non_stroking_color']isNone:returnFalseifobj['width']=1andobj['height']>=1andobj['non_stroking_color']isnotNoneifobj['object_type']=='char':returnobj['stroking_color']isnotNoneandobj['non_stroking_color']isnotNonere
turnTruedefextract_text_and_tables(self,page): """从给定的页面中提取文本和表格。"""buttom=0page=page.filter(self.keep_visible_lines)tables=page.find_tables()iflen(tables)>=1: #表格数据count=len(tables)fortableintables:iftable.bbox[3]end_re='^(?:\d|\\|\/|第|共|页|-|_|){1,}'ifself.last_num==0:try:first_text=str(self.all_text[1]['inside'])end_text=str(self.all_text[len(self.all_text)-1]['inside'])ifre.search(first_re,first_text)andnot'['inend_text:self.all_text[1]['type']='页眉'ifre.search(end_re,end_text)andnot'['inend_text:self.all_text[len(self.all_text)-1]['type']='页脚'except:print(page.page_number)else:try:first_text=str(self.all_text[self.last_num+2]['inside'])end_text=str(self.all_text[len(self.all_text)-1]['inside'])ifre.search(first_re,first_text)and'['notinend_text:self.all_text[self.last_num+2]['type']='页眉'ifre.search(end_re,end_text)and'['notinend_text:self.all_text[len(self.all_text)-1]['type']='页脚'except:print(page.page_number)self.last_num=len(self.all_text)-1defprocess_pdf(self): """处理整个PDF文档。"""foriinrange(len(self.pdf.pages)):self.extract_text_and_tables(self.pdf.pages[i])defsave_all_text(self,path): """将提取的所有文本保存到指定路径的文件中。"""withopen(path,'w',encoding='utf-8')asfile:forkeyinself.all_text.keys():file.write(json.dumps(self.all_text[key],ensure_ascii=False)+'\n')defprocess_all_pdfs_in_folder(folder_path): 
"""处理指定文件夹下的所有PDF文件。"""file_paths=glob.glob(f'{folder_path}/*')file_paths=sorted(file_paths,reverse=True)forfile_pathinfile_paths:print(file_path)try:processor=PDFProcessor(file_path)processor.process_pdf()save_path='RAG_ASMPLE_DATAS_TXTS/'+file_path.split('/')[-1].replace('.pdf','.txt')processor.save_all_text(save_path)except:print('check')if__name__=='__main__':#需要解析的pdf文件路径pdf_path=r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.pdf'#pdf解析后的txt内容文件out_path=r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.txt'processor=PDFProcessor(pdf_path)processor.process_pdf()processor.save_all_text(out_path)123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233参考版面分析–PDF解析神器pdfplumber版面分析–富文本txt读取补充提取PDF中的图片并保存到本地importpdfplumberimportos#定义函数用于提取PDF中的图片并保存defextract_images_from_pdf(pdf_file,output_folder):#创建输出文件夹，如果不存在的话ifnotos.path.exists(output_folder)

s.makedirs(output_folder)withpdfplumber.open(pdf_file)aspdf:#遍历每一页forpage_number,pageinenumerate(pdf.pages,start=1):print(f'页码：{page.page_number}')print(f'页面宽度：{page.width}')print(f'页面高度：{page.height}')#获取该页的所有图片images=page.images#遍历该页的所有图片foridx,imageinenumerate(images,start=1):#获取图片的二进制数据image_data=image['stream'].get_data()#构建图片文件名image_filename=os.path.join(output_folder,f'image_{page_number}_{idx}.png')#保存图片到文件withopen(image_filename,'wb')asf:f.write(image_data)print(f'图片已保存至：{image_filename}')#示例使用pdf_file='example.pdf'output_folder='extracted_images'extract_images_from_pdf(pdf_file,output_folder)123456789101112131415161718192021222324252627282930313233343536提取pdf表格文本，保存为excel文件importpdfplumberfromopenpyxlimportWorkbook#定义函数用于提取PDF中的表格并保存为Excel文件defextract_tables_to_excel(pdf_file,excel_output_file):withpdfplumber.open(pdf_file)aspdf:workbook=Workbook()sheet=workbook.active#遍历每一页forpageinpdf.pages:#提取该页的表格table=page.extract_table()#如果表格存在，则将其写入Excel文件iftable:forrowintable:sheet.append(row)#保存Excel文件workbook.save(excel_output_file)#示例使用pdf_file='example.pdf'excel_output_file='tables.xlsx'extract_tables_to_excel(pdf_file,excel_output_file)1234567891011121314151617181920212223242526提取PDF表格文本importpdfplumber#定义函数用于提取PDF中的表格并保存为文本文件defextract_tables_to_text(pdf_file,text_output_file):withpdfplumber.open(pdf_file)aspdf:withopen(text_output_file,'w',encoding='utf-8')asoutput:#遍历每一页forpageinpdf.pages:#提取该页的表格table=page.extract_table()#如果表格存在，则将其写入文本文件iftable:forrowintable

utput.write('\t'.join(str(cell)forcellinrow)+'\n')#示例使用pdf_file='example.pdf'text_output_file='tables.txt'extract_tables_to_text(pdf_file,text_output_file)1234567891011121314151617181920提取PDF纯文本importpdfplumber#定义函数用于提取PDF中的纯文本并保存为文本文件defextract_text_to_file(pdf_file,text_output_file):withpdfplumber.open(pdf_file)aspdf:withopen(text_output_file,'w',encoding='utf-8')asoutput:#遍历每一页forpageinpdf.pages:#提取该页的文本text=page.extract_text()#如果文本存在，则将其写入文本文件iftext

utput.write(text)#示例使用pdf_file='example.pdf'text_output_file='text.txt'extract_text_to_file(pdf_file,text_output_file) 1234567891011121314151617181920读取富文本txtpython读取文件函数有三种read()、readline()、readlines()read()一次性读取所有文本readline()读取第一行的内容readlines()读取全部内容，以数列的格式返回#一次性读取所有文本withopen('story.txt','r',encoding='utf-8')asf:data=f.read()print(data)#读取第一行的内容withopen('story.txt','r',encoding='utf-8')asf:data=f.readline()print(data)#读取全部内容，逐行读取并去除换行符withopen('story.txt','r',encoding='utf-8')asf:forlineinf.readlines():line=line.strip('\n')print(line)123456789101112131415

		自动登录	找回密码
密码			会员注册