需求说明:给定OR期刊文章的PDF文档,提取我们感兴趣的文章标题、作者、作者机构、关键字、收稿日期(Received)、修订日期(Revised)、录用日期(Accepted)、出版日期、审稿分区等信息
提取效果:
开发工具:VS2010+Spire.PDF插件
需要组件:按钮(btn_or2018)、文本框(textbox1)
设计思想:
(1)读入pdf文档
PdfDocument pdf = new PdfDocument();
pdf.LoadFromFile(fileName);
PdfPageBase page = pdf.Pages[1];//本次下载的OR文章正文第一页页码为 2(索引从0开始,故取Pages[1])
(2)首先使用 page.FindText("OPERATIONS RESEARCH").Finds 判断该文档是否为OR期刊
(3)使用page.ExtractText函数进行关键标识词定位
(4)根据定位结果,进行相关位置的字符串提取(主要涉及函数Substring)与显示(textbox控件)
源代码:
1. 打开文档
/// <summary>
/// Click handler for the "OR 2018" button: lets the user pick a PDF file,
/// extracts the paper metadata from it and shows the result in <c>textBox1</c>.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void btn_or2018_Click(object sender, EventArgs e)
{
    string pdfPath;

    // The dialog holds native resources, so dispose it deterministically.
    using (OpenFileDialog ofd = new OpenFileDialog())
    {
        ofd.Filter = "PDF文档(*.pdf)| *.pdf";

        // If the user cancels, FileName is empty; bail out instead of handing
        // an empty path to the PDF loader.
        if (ofd.ShowDialog() != DialogResult.OK || string.IsNullOrEmpty(ofd.FileName))
        {
            return;
        }

        pdfPath = ofd.FileName;
    }

    PaperInformation_OR paperOR = new PaperInformation_OR().getPaperInfo(pdfPath);
    textBox1.Text = paperOR.ToString();
}
2. 定义PaperInformation_OR类
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Spire.Pdf;
using Spire.License;
using System.Drawing;
using System.Windows.Forms;
using System.Reflection;
using System.Data;

namespace OperationsResearch
{
    /// <summary>
    /// Holds the bibliographic metadata extracted from the first article page of an
    /// "Operations Research" PDF and provides the extraction routines themselves.
    /// </summary>
    class PaperInformation_OR
    {
        #region Member fields
        public string m_title;
        public string m_author;
        public string m_organization;
        public string m_contact;
        public string m_receivedDate;
        public string m_revisedDate;
        public string m_acceptedDate;
        public string m_publishedDate;
        public string m_areaOfRiview;
        public string m_subjectClassfication;
        public string m_doi;
        public string m_year;
        public string m_volume;
        public string m_issue;
        public string m_page;
        #endregion

        // Month names in calendar order; two consecutive months map to one issue.
        private static readonly string[] s_monthNames =
        {
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December"
        };

        // Matches one or more consecutive whitespace characters (built once, reused).
        private static readonly System.Text.RegularExpressions.Regex s_whitespaceRun =
            new System.Text.RegularExpressions.Regex("[\\s]+");

        #region Constructors
        /// <summary>
        /// Creates a fully populated record. Each argument is stored verbatim in the
        /// corresponding <c>m_*</c> field.
        /// </summary>
        public PaperInformation_OR(string Title, string Author, string Organization, string Contact,
            string Received, string Revised, string Accepted, string Published,
            string year, string Volume, string Issue, string Page,
            string SubjectClassification, string AreaOfReview, string Doi)
        {
            m_title = Title;
            m_author = Author;
            m_organization = Organization;
            m_contact = Contact;
            m_receivedDate = Received;
            m_revisedDate = Revised;
            m_acceptedDate = Accepted;
            m_publishedDate = Published;
            m_year = year;
            m_volume = Volume;
            m_issue = Issue;
            m_page = Page;
            m_subjectClassfication = SubjectClassification;
            m_areaOfRiview = AreaOfReview;
            m_doi = Doi;
        }

        /// <summary>Creates an empty record (all fields are null).</summary>
        public PaperInformation_OR()
        {
        }
        #endregion

        /// <summary>
        /// Formats the record as multi-line text for display.
        /// </summary>
        /// <remarks>
        /// The four date fields are printed without a label. The subject
        /// classification and the area of review are NOT part of the output.
        /// </remarks>
        /// <returns>The formatted text, lines separated by CR LF.</returns>
        public override string ToString()
        {
            StringBuilder text = new StringBuilder();
            text.Append("题目:").Append(m_title);
            text.Append("\r\n作者:").Append(m_author);
            text.Append("\r\n作者单位:").Append(m_organization);
            text.Append("\r\n作者联系方式:").Append(m_contact);
            text.Append("\r\n").Append(m_receivedDate);
            text.Append("\r\n").Append(m_revisedDate);
            text.Append("\r\n").Append(m_acceptedDate);
            text.Append("\r\n").Append(m_publishedDate);
            text.Append("\r\n年:").Append(m_year);
            text.Append("\r\n卷:").Append(m_volume);
            text.Append("\r\n期:").Append(m_issue);
            text.Append("\r\n页码:").Append(m_page);
            text.Append("\r\ndoi:").Append(m_doi);
            return text.ToString();
        }

        /// <summary>
        /// Derives the issue number (1-6, two months per issue) from the publication text.
        /// </summary>
        /// <param name="PublishTime">
        /// Text such as "Advance:March 12, 2018". The month name is read from the letters
        /// that follow the first ':' (leading whitespace is skipped); when there is no
        /// ':' the month is read from the start of the text.
        /// </param>
        /// <returns>
        /// The issue number, or 0 when the text is null/empty or no English month name
        /// (case-sensitive) is found.
        /// </returns>
        public int getIssue(string PublishTime)
        {
            if (string.IsNullOrEmpty(PublishTime))
            {
                return 0;
            }

            int colonIndex = PublishTime.IndexOf(':');
            string tail = colonIndex >= 0 ? PublishTime.Substring(colonIndex + 1) : PublishTime;
            tail = tail.TrimStart();

            // Take the leading run of letters so that both "March 12" and "March12"
            // (extracted text frequently loses its spaces) yield "March".
            int letterCount = 0;
            while (letterCount < tail.Length && Char.IsLetter(tail[letterCount]))
            {
                letterCount++;
            }

            int monthIndex = Array.IndexOf(s_monthNames, tail.Substring(0, letterCount));
            return monthIndex < 0 ? 0 : monthIndex / 2 + 1;
        }

        /// <summary>
        /// Re-inserts spaces into affiliation text whose spaces were lost during
        /// extraction: a space is added before every "and" / "of", before every
        /// upper-case letter, and before a digit that directly follows a letter.
        /// </summary>
        /// <param name="str">The text to process; null or empty is returned unchanged.</param>
        /// <returns>The text with the extra spaces inserted.</returns>
        public string dealString(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }

            // NOTE(review): these replacements also hit "and"/"of" inside longer words
            // (e.g. "Portland"); kept as-is because callers rely on the current output.
            str = str.Replace("and", " and").Replace("of", " of");

            StringBuilder spaced = new StringBuilder(str.Length * 2);
            for (int i = 0; i < str.Length; i++)
            {
                char current = str[i];

                // Space before every upper-case letter.
                if (Char.IsUpper(current))
                {
                    spaced.Append(' ');
                }

                // Space before a digit that directly follows a letter.
                if (i > 0 && Char.IsLetter(str[i - 1]) && Char.IsDigit(current))
                {
                    spaced.Append(' ');
                }

                spaced.Append(current);
            }

            return spaced.ToString();
        }

        #region Collapse consecutive whitespace
        /// <summary>
        /// Replaces every run of consecutive whitespace characters with a single space.
        /// </summary>
        /// <param name="str">The text to process.</param>
        /// <returns>The collapsed text; null or empty input is returned unchanged.</returns>
        public string MergeSpace(string str)
        {
            if (string.IsNullOrEmpty(str))
            {
                return str;
            }

            return s_whitespaceRun.Replace(str, " ");
        }
        #endregion

        /// <summary>
        /// Extracts the metadata of a paper from the second page (index 1) of the PDF,
        /// using the keywords "Received", "Abstract", "Copyright" and "pp." to locate
        /// the relevant regions.
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>A populated <see cref="PaperInformation_OR"/>.</returns>
        /// <exception cref="InvalidOperationException">
        /// The document has fewer than two pages, or one of the expected keywords /
        /// markers is missing from the page.
        /// </exception>
        public PaperInformation_OR getPaperInfo(string fileName)
        {
            PdfDocument pdf = new PdfDocument();
            try
            {
                pdf.LoadFromFile(fileName);
                if (pdf.Pages.Count < 2)
                {
                    throw new InvalidOperationException("The PDF has no second page to read the article header from.");
                }

                PdfPageBase page = pdf.Pages[1];

                // Top-left corners of the anchor keywords on the page.
                System.Drawing.Point receivedPos = LocateOrThrow(page, "Received");
                System.Drawing.Point abstractPos = LocateOrThrow(page, "Abstract");
                System.Drawing.Point copyrightPos = LocateOrThrow(page, "Copyright");

                // ------------- text between the "Received" line and the copyright line -------------
                string paperInfo = page.ExtractText(new RectangleF(
                    receivedPos.X,
                    receivedPos.Y,
                    abstractPos.X - receivedPos.X,
                    copyrightPos.Y - receivedPos.Y)).Replace("\r\n", "");

                string receivedTime = SliceBetween(paperInfo, "Received", "Revised");
                string revisedTime = SliceBetween(paperInfo, "Revised", "Accepted");
                string acceptedTime = SliceBetween(paperInfo, "Accepted", "Published");
                string subjectClassification = SliceBetween(paperInfo, "Subject", "Area");
                string areaOfReview = SliceBetween(paperInfo, "Area", "https");
                string doi = SliceBetween(paperInfo, "https", "Copyright");

                // The year is read from the DOI: the 4 characters starting 9 from the end.
                if (doi.Length < 9)
                {
                    throw new InvalidOperationException("The extracted DOI is too short to contain the year: " + doi);
                }

                string year = doi.Substring(doi.Length - 9, 4);

                // Volume number is derived as (year - 1952).
                string volume = (int.Parse(year) - 1952).ToString();

                string publishTime = SliceBetween(paperInfo, "Advance:", "Subject");
                int issue = getIssue(publishTime);

                // ------------- page range, title, authors, affiliations, contact -------------
                System.Drawing.Point pageCntPos = LocateOrThrow(page, "pp.");
                string paperPageCnt = page.ExtractText(
                    new RectangleF(pageCntPos.X, pageCntPos.Y, 50, 10)).Replace("\r\n", "");

                string headerText = page.ExtractText(new RectangleF(
                    receivedPos.X,
                    pageCntPos.Y + 20,
                    550,
                    abstractPos.Y - pageCntPos.Y - 30)).Trim();

                // "\r\na" marks the first affiliation line (footnote label "a");
                // everything before it is title + author line.
                int affiliationIndex = IndexOrThrow(headerText, "\r\na");
                int contactIndex = IndexOrThrow(headerText, "Contact:");

                string titleAndAuthor = headerText.Substring(0, affiliationIndex - 2);

                // The last line of that part is the author list, the rest is the title.
                int authorLineIndex = titleAndAuthor.LastIndexOf("\r\n", StringComparison.Ordinal);
                if (authorLineIndex < 0)
                {
                    throw new InvalidOperationException("Could not separate the title from the author line.");
                }

                string paperTitle = titleAndAuthor.Substring(0, authorLineIndex - 2).Replace("\r\n", " ");
                string paperAuthor = titleAndAuthor.Substring(authorLineIndex).Replace("\r\n", " ");

                string authorOrganization = headerText
                    .Substring(affiliationIndex + 2, contactIndex - affiliationIndex - 3)
                    .Replace("\r\n", " ");
                authorOrganization = dealString(authorOrganization).Trim();

                string contactAuthor = headerText.Substring(contactIndex).Replace("\r\n", " ");

                // The revised date is passed in its own slot (it used to be overwritten
                // by the received date).
                return new PaperInformation_OR(paperTitle, paperAuthor, authorOrganization, contactAuthor,
                    receivedTime, revisedTime, acceptedTime, publishTime,
                    year, volume, issue.ToString(), paperPageCnt,
                    subjectClassification, areaOfReview, doi);
            }
            finally
            {
                // Release the file handle / native resources held by the document.
                pdf.Close();
            }
        }

        /// <summary>
        /// Extracts the metadata of a paper laid out in the 2015 format from the second
        /// page (index 1) of the PDF. Author, affiliation and contact are not extracted
        /// and are returned as a single space; the publication date is the placeholder
        /// text "出版日期".
        /// </summary>
        /// <param name="fileName">Path of the PDF file.</param>
        /// <returns>
        /// A populated record, or an empty record (after showing a message box) when the
        /// page does not look like a 2015-format "Operations Research" article.
        /// </returns>
        public PaperInformation_OR getPaperInfo2015(string fileName)
        {
            PdfDocument pdf = new PdfDocument();
            try
            {
                pdf.LoadFromFile(fileName);
                if (pdf.Pages.Count < 2)
                {
                    return RejectDocument("该文档非标准2015格式");
                }

                PdfPageBase page = pdf.Pages[1];

                System.Drawing.Point logoPos;
                if (!TryLocate(page, "OPERATIONS RESEARCH", out logoPos))
                {
                    return RejectDocument("该文档非OR期刊");
                }

                // ------------- "Vol. x, No. y, Month year, pp. a-b" line -------------
                System.Drawing.Point volPos;
                if (!TryLocate(page, "Vol.", out volPos))
                {
                    return RejectDocument("该文档非标准2015格式");
                }

                string volumeLine = page.ExtractText(new RectangleF(volPos.X, volPos.Y, 200, 10));
                string[] volumeParts = volumeLine.Split(',');
                if (volumeParts.Length < 4 || volumeParts[2].Length < 5)
                {
                    return RejectDocument("该文档非标准2015格式");
                }

                // Order of the comma-separated parts: volume, issue, date (year last), pages.
                string paperVolume = volumeParts[0];
                string paperIssue = volumeParts[1];
                string paperYear = volumeParts[2].Substring(volumeParts[2].Length - 5, 5);
                string paperPage = volumeParts[3].Replace("\r\n", "");

                // ------------- article URL -------------
                System.Drawing.Point doiPos;
                if (!TryLocate(page, "http:", out doiPos))
                {
                    return RejectDocument("该文档非标准2015格式,定位网址信息错误");
                }

                string paperDoi = page.ExtractText(new RectangleF(doiPos.X, doiPos.Y, 150, 7)).Trim();

                // ------------- everything between the URL and the "History:" line -------------
                System.Drawing.Point historyPos;
                if (!TryLocate(page, "History:", out historyPos))
                {
                    return RejectDocument("该文档非标准2015格式,定位日期信息错误");
                }

                string paperOthers = page.ExtractText(new RectangleF(
                    historyPos.X,
                    doiPos.Y + 20,
                    520,
                    historyPos.Y - doiPos.Y)).Trim();

                // ------------- title: text up to the first run of three line breaks -------------
                int titleEnd = paperOthers.IndexOf("\r\n\r\n\r\n", StringComparison.Ordinal);
                if (titleEnd < 0)
                {
                    return RejectDocument("该文档非标准2015格式");
                }

                string paperTitle = paperOthers.Substring(0, titleEnd).Replace("\r\n", "");

                // Whitespace is only collapsed when no run of four line breaks is found
                // past the start of the text.
                if (paperOthers.IndexOf("\r\n\r\n\r\n\r\n", StringComparison.Ordinal) <= 0)
                {
                    paperTitle = MergeSpace(paperTitle);
                }

                string paperAuthor = " ";
                string paperAuthorCompany = " ";
                string paperContact = " ";
                string paperPublish = "出版日期";

                // ------------- classification / area of review / history -------------
                // NOTE(review): "classiflcations" (with an 'l') is kept exactly as it was;
                // presumably it matches how the text comes out of the PDF - confirm against a sample.
                int scIndex = paperOthers.IndexOf("Subject classiflcations:", StringComparison.Ordinal);
                int aorIndex = paperOthers.IndexOf("Area of review", StringComparison.Ordinal);
                int hisIndex = paperOthers.IndexOf("History:", StringComparison.Ordinal);
                if (scIndex < 0 || aorIndex < scIndex || hisIndex < aorIndex)
                {
                    return RejectDocument("该文档非标准2015格式");
                }

                string paperSubClass = paperOthers.Substring(scIndex, aorIndex - scIndex)
                    .Replace("\r\n", "").Trim();
                string paperAreaOfReview = paperOthers.Substring(aorIndex, hisIndex - aorIndex)
                    .Replace("\r\n", "").Trim();

                // "History: received ...; revised ...; accepted ..."
                string paperDateInfo = paperOthers.Substring(hisIndex).Replace("\r\n", "").Trim();
                string[] dateParts = paperDateInfo.Split(':')[1].Split(';');
                if (dateParts.Length < 3)
                {
                    return RejectDocument("该文档非标准2015格式,定位日期信息错误");
                }

                string paperReceivedDate = dateParts[0];
                string paperRevisedDate = dateParts[1];
                string paperAcceptedDate = dateParts[2];

                return new PaperInformation_OR(paperTitle, paperAuthor, paperAuthorCompany, paperContact,
                    paperReceivedDate, paperRevisedDate, paperAcceptedDate, paperPublish,
                    paperYear, paperVolume, paperIssue, paperPage,
                    paperSubClass, paperAreaOfReview, paperDoi);
            }
            finally
            {
                // Release the file handle / native resources held by the document.
                pdf.Close();
            }
        }

        #region Private helpers
        // Shows the message to the user and returns an empty record.
        private static PaperInformation_OR RejectDocument(string message)
        {
            MessageBox.Show(message);
            return new PaperInformation_OR();
        }

        // Finds the first occurrence of keyword on the page and reports its position
        // (coordinates truncated to int); returns false when the keyword is absent.
        private static bool TryLocate(PdfPageBase page, string keyword, out System.Drawing.Point position)
        {
            var finds = page.FindText(keyword).Finds;
            if (finds == null || finds.Length == 0)
            {
                position = System.Drawing.Point.Empty;
                return false;
            }

            position = new System.Drawing.Point((int)finds[0].Position.X, (int)finds[0].Position.Y);
            return true;
        }

        // Same as TryLocate, but a missing keyword is reported as an InvalidOperationException.
        private static System.Drawing.Point LocateOrThrow(PdfPageBase page, string keyword)
        {
            System.Drawing.Point position;
            if (!TryLocate(page, keyword, out position))
            {
                throw new InvalidOperationException("Keyword not found on the page: " + keyword);
            }

            return position;
        }

        // Ordinal search for marker in text; a missing marker is reported as an
        // InvalidOperationException that names the marker.
        private static int IndexOrThrow(string text, string marker)
        {
            int index = text.IndexOf(marker, StringComparison.Ordinal);
            if (index < 0)
            {
                throw new InvalidOperationException("Marker not found in the extracted text: " + marker);
            }

            return index;
        }

        // Returns the part of text that starts at startMarker (inclusive) and ends just
        // before the next endMarker.
        private static string SliceBetween(string text, string startMarker, string endMarker)
        {
            int start = IndexOrThrow(text, startMarker);
            int end = text.IndexOf(endMarker, start, StringComparison.Ordinal);
            if (end < 0)
            {
                throw new InvalidOperationException("Marker not found in the extracted text: " + endMarker);
            }

            return text.Substring(start, end - start);
        }
        #endregion
    }
}