开发者社区> 小麋鹿666> 正文

PDFToText with ITextSharp--Extract text from PDF in C# (100% .NET)(推荐)

简介:
+关注继续查看
using System;
using System.IO;
using iTextSharp.text.pdf;

namespace PdfToText
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    /// <remarks>
    /// The parser scans the raw page content stream byte by byte and reacts
    /// to a handful of operators:
    ///   BT       = beginning of a text object
    ///   ET       = end of a text object
    ///   Td / TD  = move to the start of the next line
    ///   T* ' "   = move to the next line (and, for ' and ", show text)
    ///   Tj       = show text
    /// Only literal strings written as "( ... )" are extracted; bytes are
    /// mapped one-to-one to characters, so multi-byte encodings are not decoded.
    /// </remarks>
    public class PDFParser
    {
        #region Fields

        /// <summary>
        /// The number of most recent characters kept while scanning, used to
        /// look back for operators. Must be large enough for the longest
        /// operator plus one delimiter on each side.
        /// </summary>
        private const int NumberOfCharsToKeep = 15;

        /// <summary>
        /// Width, in '#' characters, of the console progress bar.
        /// </summary>
        private const int ProgressBarWidth = 68;

        // Operator sets are allocated once instead of once per input byte.
        private static readonly string[] BeginTextTokens = new string[] { "BT" };
        private static readonly string[] EndTextTokens = new string[] { "ET" };
        private static readonly string[] MoveTextTokens = new string[] { "TD", "Td" };
        private static readonly string[] NextLineTokens = new string[] { "'", "T*", "\"" };
        private static readonly string[] ShowTextTokens = new string[] { "Tj" };

        #endregion

        #region ExtractText

        /// <summary>
        /// Extracts the text of every page of a PDF file and writes it,
        /// UTF-8 encoded, to <paramref name="outFileName"/>. A progress bar
        /// is written to the console while pages are processed.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>
        /// <c>true</c> when all pages were processed; <c>false</c> when any
        /// exception occurred while reading or writing.
        /// </returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            PdfReader reader = null;
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                reader = new PdfReader(inFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int pageCount = reader.NumberOfPages;

                // Number of '#' marks one page is worth. It is < 1 when the
                // document has more pages than the bar has columns.
                float charUnit = ((float)ProgressBarWidth) / (float)pageCount;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= pageCount; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        totalWritten += WriteProgressMarks((int)charUnit);
                    }
                    else
                    {
                        // Accumulate fractions until at least one mark is due.
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            totalWritten += WriteProgressMarks((int)curUnit);
                            curUnit = 0;
                        }
                    }
                }

                // Rounding may leave the bar short; pad it to full width.
                if (totalWritten < ProgressBarWidth)
                {
                    WriteProgressMarks(ProgressBarWidth - totalWritten);
                }

                return true;
            }
            catch
            {
                // Best effort by contract: callers only learn success/failure.
                return false;
            }
            finally
            {
                // Release both resources even if closing the writer throws.
                try
                {
                    if (outFile != null) outFile.Close();
                }
                finally
                {
                    if (reader != null) reader.Close();
                }
            }
        }

        /// <summary>
        /// Writes <paramref name="count"/> progress marks ('#') to the console.
        /// </summary>
        /// <param name="count">how many marks to write; values below 1 write nothing.</param>
        /// <returns>the number of marks actually written.</returns>
        private static int WriteProgressMarks(int count)
        {
            int written = 0;
            for (int i = 0; i < count; i++)
            {
                Console.Write("#");
                written++;
            }
            return written;
        }

        #endregion

        #region ExtractTextFromPDFBytes

        /// <summary>
        /// Processes an uncompressed page content stream and extracts the
        /// text of the literal strings found inside its text objects.
        /// </summary>
        /// <param name="input">the uncompressed content stream bytes; may be null or empty.</param>
        /// <returns>the extracted text, or an empty string when there is no input.</returns>
        private string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            // A builder avoids re-copying the whole result for every appended character.
            System.Text.StringBuilder result = new System.Text.StringBuilder();

            // Flag showing if we are currently inside a text object
            bool inTextObject = false;

            // Flag showing if the next character is literal
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;

            // () Bracket nesting level. Text appears inside ()
            int bracketDepth = 0;

            // Sliding window over the most recent characters, newest last,
            // pre-filled with spaces so an operator at the very start of the
            // stream is seen as preceded by a delimiter.
            char[] previousCharacters = new char[NumberOfCharsToKeep];
            for (int j = 0; j < NumberOfCharsToKeep; j++) previousCharacters[j] = ' ';

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];

                if (inTextObject)
                {
                    // Outside of a string: translate positioning/show
                    // operators that have just been completed into whitespace.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveTextTokens, previousCharacters))
                        {
                            result.Append("\r\n");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            result.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            result.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object, also separate from what follows.
                        inTextObject = false;
                        result.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Backslash: emit the following character as-is.
                            nextLiteral = true;
                        }
                        else
                        {
                            // Keep printable ASCII and the high half (128..254) only.
                            if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                            {
                                result.Append(c);
                            }
                            nextLiteral = false;
                        }
                    }
                }

                // Store the recent characters for when we have to look back:
                // shift the window left by one and append the current character.
                Array.Copy(previousCharacters, 1, previousCharacters, 0, NumberOfCharsToKeep - 1);
                previousCharacters[NumberOfCharsToKeep - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }

            return result.ToString();
        }

        #endregion

        #region CheckToken

        /// <summary>
        /// Checks whether one of the given operator tokens has just come
        /// along (e.g. BT), i.e. whether the recent characters end with
        /// "delimiter, token, delimiter".
        /// </summary>
        /// <param name="tokens">the searched tokens; each may be one or more characters long.</param>
        /// <param name="recent">the recent character array, newest character last.</param>
        /// <returns><c>true</c> if any token matches; otherwise <c>false</c>.</returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            int last = recent.Length - 1;

            // Every match requires the newest character to be a delimiter.
            if (last < 0 || !IsDelimiter(recent[last])) return false;

            foreach (string token in tokens)
            {
                // Index in 'recent' of the token's first character.
                int start = last - token.Length;

                // Skip empty tokens and tokens that do not fit in the window
                // together with their leading delimiter.
                if (token.Length == 0 || start < 1) continue;

                if (!IsDelimiter(recent[start - 1])) continue;

                bool matches = true;
                for (int k = 0; k < token.Length; k++)
                {
                    if (recent[start + k] != token[k])
                    {
                        matches = false;
                        break;
                    }
                }

                if (matches) return true;
            }

            return false;
        }

        /// <summary>
        /// Tells whether a character separates operators: space, CR or LF.
        /// </summary>
        /// <param name="c">the character to test.</param>
        /// <returns><c>true</c> for ' ', 0x0d and 0x0a.</returns>
        private static bool IsDelimiter(char c)
        {
            return c == ' ' || c == 0x0d || c == 0x0a;
        }

        #endregion
    }
}

None.gif

usage:
using System;
using System.Text;
using System.IO;

namespace PdfToText
{
    /// <summary>
    /// The main entry point to the program.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Extracts the text of the PDF named by the first argument into a
        /// ".txt" file that carries the PDF's base name.
        /// </summary>
        /// <param name="args">command line arguments; args[0] is the path to the PDF file.</param>
        static void Main(string[] args)
        {
            try
            {
                if (args.Length < 1)
                {
                    DisplayUsage();
                    return;
                }

                string file = args[0];
                if (!File.Exists(file))
                {
                    // Retry with the fully qualified path before giving up.
                    file = Path.GetFullPath(file);
                    if (!File.Exists(file))
                    {
                        Console.WriteLine("Please give in the path to the PDF file.");
                        // Nothing to parse: stop instead of running the
                        // extraction on a file that does not exist.
                        return;
                    }
                }

                PDFParser pdfParser = new PDFParser();
                bool succeeded = pdfParser.ExtractText(file, Path.GetFileNameWithoutExtension(file) + ".txt");
                if (!succeeded)
                {
                    // ExtractText reports failures only through its return
                    // value, so surface them here. The blank line ends the
                    // "Processing: " line it may have started.
                    Console.WriteLine();
                    Console.WriteLine("Failed to extract text from: " + file);
                }
            }
            catch (Exception exc)
            {
                Console.WriteLine(exc);
            }
        }

        /// <summary>
        /// Prints the command line usage to the console.
        /// </summary>
        static void DisplayUsage()
        {
            Console.WriteLine();
            Console.WriteLine("Usage:\tpdftotext FILE");
            Console.WriteLine();
            Console.WriteLine("\tFILE\t the path to the PDF file, it may be relative or absolute.");
            Console.WriteLine();
        }
    }
}

None.gif

问题,不支持中文,没有布局,仅仅是把每页的所有文字抽取出来,如果想真正实现PDFtoTxt,仍然有好多路要走,但毕竟是个好的开始。

from http://www.codeproject.com/useritems/PDFToText.asp
 

本文转自RubyPdf 的中文博客博客园博客,原文链接:http://www.cnblogs.com/hardrock/archive/2006/06/16/427112.html/,如需转载请自行联系原作者

版权声明:本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。

相关文章
阿里云服务器如何登录?阿里云服务器的三种登录方法
购买阿里云ECS云服务器后如何登录?场景不同,阿里云优惠总结大概有三种登录方式: 登录到ECS云服务器控制台 在ECS云服务器控制台用户可以更改密码、更换系.
28516 0
阿里云服务器ECS登录用户名是什么?系统不同默认账号也不同
阿里云服务器Windows系统默认用户名administrator,Linux镜像服务器用户名root
15944 0
阿里云服务器端口号设置
阿里云服务器初级使用者可能面临的问题之一. 使用tomcat或者其他服务器软件设置端口号后,比如 一些不是默认的, mysql的 3306, mssql的1433,有时候打不开网页, 原因是没有在ecs安全组去设置这个端口号. 解决: 点击ecs下网络和安全下的安全组 在弹出的安全组中,如果没有就新建安全组,然后点击配置规则 最后如上图点击添加...或快速创建.   have fun!  将编程看作是一门艺术,而不单单是个技术。
20388 0
腾讯云服务器 设置ngxin + fastdfs +tomcat 开机自启动
在tomcat中新建一个可以启动的 .sh 脚本文件 /usr/local/tomcat7/bin/ export JAVA_HOME=/usr/local/java/jdk7 export PATH=$JAVA_HOME/bin/:$PATH export CLASSPATH=.
14884 0
阿里云服务器怎么设置密码?怎么停机?怎么重启服务器?
如果在创建实例时没有设置密码,或者密码丢失,您可以在控制台上重新设置实例的登录密码。本文仅描述如何在 ECS 管理控制台上修改实例登录密码。
23548 0
阿里云服务器ECS远程登录用户名密码查询方法
阿里云服务器ECS远程连接登录输入用户名和密码,阿里云没有默认密码,如果购买时没设置需要先重置实例密码,Windows用户名是administrator,Linux账号是root,阿小云来详细说下阿里云服务器远程登录连接用户名和密码查询方法
22282 0
阿里云服务器如何登录?阿里云服务器的三种登录方法
购买阿里云ECS云服务器后如何登录?场景不同,云吞铺子总结大概有三种登录方式: 登录到ECS云服务器控制台 在ECS云服务器控制台用户可以更改密码、更换系统盘、创建快照、配置安全组等操作如何登录ECS云服务器控制台? 1、先登录到阿里云ECS服务器控制台 2、点击顶部的“控制台” 3、通过左侧栏,切换到“云服务器ECS”即可,如下图所示 通过ECS控制台的远程连接来登录到云服务器 阿里云ECS云服务器自带远程连接功能,使用该功能可以登录到云服务器,简单且方便,如下图:点击“远程连接”,第一次连接会自动生成6位数字密码,输入密码即可登录到云服务器上。
36381 0
+关注
385
文章
0
问答
文章排行榜
最热
最新
相关电子书
更多
JS零基础入门教程(上册)
立即下载
性能优化方法论
立即下载
手把手学习日志服务SLS,云启实验室实战指南
立即下载