"""
Created on Wed Aug  3 10:06:27 2022

@author: 01394546
"""
import csv
import os

import cv2
import fitz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image


def tess_ocr(pdf_path):
    """Extract the table cells of a PDF and write them out as a CSV file.

    Every page is rendered to an image with ``convert_from_path``; the grid
    lines found in that image by ``cut_pdf2_pic`` are scaled onto the page
    rectangle, and the text inside each cell is read with
    ``page.get_text(clip=...)``.  The collected columns are assembled into a
    DataFrame and saved under ``./黑房东数据/OUT/<pdf file name>.csv``.

    Despite its name, this function does not call pytesseract.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None.  The result is written to disk.
    """
    image_dir = './out/'
    csv_dir = './黑房东数据/OUT/'
    # Both directories are written to below; create them up front so the run
    # does not fail on a fresh checkout.
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)

    images = convert_from_path(
        pdf_path, fmt='png', output_folder=image_dir, userpw='site',
        poppler_path=r'E:\poppler-0.68.0_x86\poppler-0.68.0\bin')

    # Margins (in image pixels) added around each cell before clipping.
    pad_x = 30
    pad_y = 50

    result = {}
    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as doc:
        for page_idx, img in enumerate(images):
            # Report a 1-based page number so it matches the page total.
            print('正在处理{0}的第{1}/{2}页。。。'.format(pdf_path, page_idx + 1, len(images)))
            pixels = np.asarray(img)
            print(pixels.shape)

            # Reverse the channel order before handing the image to OpenCV.
            ch0, ch1, ch2 = cv2.split(pixels)
            img_new1 = cv2.merge([ch2, ch1, ch0])
            # xs: positions along the image height, ys: along the image width.
            xs, ys = cut_pdf2_pic(img_new1)

            page1 = doc.load_page(page_idx)
            rect = page1.rect

            # Seven vertical boundaries means six columns, i.e. the layout
            # that includes the leading '行政区' column.
            if len(ys) == 7:
                cols = ['行政区', '区域', '地址', '事件描述', '黑房东联系方式', '事件时间']
            else:
                cols = ['区域', '地址', '事件描述', '黑房东联系方式', '事件时间']

            # NOTE(review): a page whose column count does not match `cols`
            # still raises (IndexError here or a length mismatch in
            # pd.DataFrame), exactly as before.
            for row_idx in range(len(xs) - 1):
                for col_idx in range(len(ys) - 1):
                    # Scale pixel coordinates to page coordinates, using the
                    # last detected line as the full extent.
                    clip = fitz.Rect(
                        ((ys[col_idx] - pad_x) / ys[-1]) * rect.width,
                        ((xs[row_idx] - pad_y) / xs[-1]) * rect.height,
                        ((ys[col_idx + 1] + pad_x) / ys[-1]) * rect.width,
                        ((xs[row_idx + 1] + pad_y) / xs[-1]) * rect.height)
                    a_text = page1.get_text(clip=clip)
                    result.setdefault(cols[col_idx], []).append(a_text.replace('\n', ''))
    print(result)

    data = pd.DataFrame(result)
    data.index.name = '编号'
    data['标注名称'] = data['地址']
    if '行政区' in data.columns:
        data['地名地址'] = '深圳市' + data['行政区'] + data['区域'] + data['地址']
    else:
        data['地名地址'] = '深圳市' + data['区域'] + data['地址']
    data['经度'] = None
    data['纬度'] = None
    data['被举报原因'] = data['事件描述']
    data['举报日期[日期]'] = data['事件时间']

    # os.path.basename handles the platform's separators, whereas
    # split('/') left directory parts in the name for Windows-style paths.
    out_name = os.path.basename(pdf_path) + '.csv'
    data[['标注名称', '地名地址', '经度', '纬度', '被举报原因', '黑房东联系方式', '举报日期[日期]']].to_csv(
        os.path.join(csv_dir, out_name))
def _drop_crowded(positions, min_gap=20):
    """Prefix *positions* with 0 and thin out entries that sit close together.

    An entry is kept only when the next entry is at least ``min_gap`` away;
    the final entry is always kept.  Returns ``(padded, kept)``.
    """
    padded = [0] + positions
    kept = [cur for cur, nxt in zip(padded, padded[1:]) if nxt - cur >= min_gap]
    kept.append(padded[-1])
    return padded, kept


def cut_pdf2_pic(image):
    """Locate the table grid lines in a page image.

    The image is binarised, long horizontal and vertical runs are isolated
    with an erode/dilate pass each, and the rows/columns that are mostly
    filled are reported as line positions.

    Args:
        image: Colour image array, converted with ``cv2.COLOR_BGR2GRAY``.

    Returns:
        ``(xs, ys)``: ``xs`` are positions along axis 0 (image rows) and
        ``ys`` are positions along axis 1 (image columns), each starting
        from a leading 0 and thinned so neighbours are at least 20 apart.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        ~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, -5)
    height, width = binary.shape
    scale = 20

    # A wide, one-pixel-tall kernel keeps only long horizontal runs.
    wide_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (width // scale, 1))
    horizontal = cv2.dilate(
        cv2.erode(binary, wide_kernel, iterations=1), wide_kernel, iterations=1)

    # A tall, one-pixel-wide kernel keeps only long vertical runs.
    tall_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, height // scale))
    vertical = cv2.dilate(
        cv2.erode(binary, tall_kernel, iterations=1), tall_kernel, iterations=1)

    grid = cv2.add(horizontal, vertical)
    print('merge.shape:', grid.shape)

    column_sums = grid.sum(axis=0)
    row_sums = grid.sum(axis=1)
    # A column counts as a line when more than 60% of it is set; a row needs 80%.
    column_hits = list(np.where(column_sums > 255 * len(row_sums) * 0.6)[0])
    row_hits = list(np.where(row_sums > 255 * len(column_sums) * 0.80)[0])

    row_padded, row_kept = _drop_crowded(row_hits)
    column_padded, column_kept = _drop_crowded(column_hits)

    print(row_padded)
    print(row_kept)
    print(column_padded)
    print(column_kept)
    return row_kept, column_kept


import os

# Run the extraction on every file found below the input directory.
Temp_Dir1 = './OCR/'
for root, dirs, files in os.walk(Temp_Dir1, topdown=False):
    for i_, file_path in enumerate(files):
        print(i_, file_path)
        tess_ocr(os.path.join(root, file_path))