python3
pip install pdfminer3k
# -*- encoding: utf-8 -*-
try:
    from urllib.request import urlopen  # Python 3 location
except ImportError:
    from urllib import urlopen  # Python 2 location

from io import StringIO

from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams


def readPdf(pdf_file):
    """Read a PDF and return its text content.

    Args:
        pdf_file: File-like object holding the PDF data; it is handed to
            ``process_pdf`` as ``fp``. The caller keeps ownership and is
            responsible for closing it.

    Returns:
        The text collected by the ``TextConverter`` as a single string.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr,
                               laparams=LAParams())
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            # Close the converter even when parsing fails.
            device.close()
        # Read the buffer before it is closed in the outer ``finally``.
        return retstr.getvalue()
    finally:
        retstr.close()


url = "http://www.pythonscraping.com/pages/warandpeace/chapter1.pdf"
# A local PDF works as well: open it with open(..., "rb") instead of urlopen.
pdf_file = urlopen(url)
try:
    content = readPdf(pdf_file)
    print(content)
finally:
    # Always release the HTTP response, even if extraction or printing fails.
    pdf_file.close()
python2
下载:https://pypi.python.org/pypi/pdfminer/
pip install pdfminer
from cStringIO import StringIO

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def readPdf2(path):
    """Read the PDF file at ``path`` and return its text (Python 2 / pdfminer).

    Args:
        path: Filesystem path of the PDF; the file is opened in binary mode.

    Returns:
        The contents of the output buffer after every page yielded by
        ``PDFPage.get_pages`` has been processed. The converter is created
        with ``codec='utf-8'``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # An empty set is passed as the page-number argument;
                # presumably this means "no page filter" -- confirm against
                # the installed pdfminer version.
                for page in PDFPage.get_pages(fp, set()):
                    interpreter.process_page(page)
            # Grab the text before the device and buffer are closed.
            return retstr.getvalue()
        finally:
            # Close the converter even when opening or parsing the file fails.
            device.close()
    finally:
        retstr.close()


# "path" is a placeholder; replace it with the real PDF location.
text = readPdf2("path")