1.前言
之前的博客《Lucene全文检索之HelloWorld》已经简单介绍了Lucene的索引生成和检索。本文着重介绍Lucene的索引删除。
2.应用场景:
索引建立完成后,由于某些原因,被索引的文件已经被删除。此时,对应的索引仍然存在;为了不产生“虚假检索结果”,需要将这些失效的索引删除。
3.HelloLucene类(重点关注deleteIndexByQuery方法)
- package com.njupt.zhb;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.LongField;
- import org.apache.lucene.document.StringField;
- import org.apache.lucene.document.TextField;
- import org.apache.lucene.index.DirectoryReader;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriterConfig;
- import org.apache.lucene.index.IndexWriterConfig.OpenMode;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryparser.classic.ParseException;
- import org.apache.lucene.queryparser.classic.QueryParser;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.Query;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.FSDirectory;
- import org.apache.lucene.util.Version;
- /*
- *@author: ZhengHaibo
- *web: http://blog.csdn.net/nuptboyzhb
- *mail: zhb931706659@126.com
- *2013-08-27 Nanjing,njupt,China
- */
- public class HelloLucene {
- /**
- * Index all text files under a directory.
- * String indexPath = "index";//索引保存的路径
- * String docsPath = "";//文档保存的路径(待索引)
- */
- public void index(String indexPath,String docsPath) {
- try {
- // 1.创建Directory
- Directory dir = FSDirectory.open(new File(indexPath));//保存在硬盘上
- // 2.创建IndexWriter
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
- IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44,
- analyzer);
- iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);// 设置创建或追加模式
- IndexWriter writer = new IndexWriter(dir, iwc);
- final File docDir = new File(docsPath);
- indexDocs(writer, docDir);
- writer.close();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- public void indexDocs(IndexWriter writer, File file) throws IOException {
- if (file.canRead()) {
- if (file.isDirectory()) {//如果是文件夹,则遍历文件夹内的所有文件
- String[] files = file.list();
- // an IO error could occur
- if (files != null) {
- for (int i = 0; i < files.length; i++) {
- indexDocs(writer, new File(file, files[i]));
- }
- }
- } else {//如果是文件
- FileInputStream fis;
- try {
- fis = new FileInputStream(file);
- } catch (FileNotFoundException fnfe) {
- return;
- }
- try {
- // 3.创建Document对象
- Document doc = new Document();
- // 4.为Document添加Field
- // Add the path of the file as a field named "path". Use a
- // field that is indexed (i.e. searchable), but don't
- // tokenize
- // the field into separate words and don't index term
- // frequency
- // or positional information:
- //以文件的文件路径建立Field
- Field pathField = new StringField("path", file.getPath(),Field.Store.YES);
- doc.add(pathField);//添加到文档中
- //以文件的名称建立索引域
- doc.add( new StringField("filename", file.getName(),Field.Store.YES));//添加到文档中
- // Add the last modified date of the file a field named
- // "modified".
- // Use a LongField that is indexed (i.e. efficiently
- // filterable with
- // NumericRangeFilter). This indexes to milli-second
- // resolution, which
- // is often too fine. You could instead create a number
- // based on
- // year/month/day/hour/minutes/seconds, down the resolution
- // you require.
- // For example the long value 2011021714 would mean
- // February 17, 2011, 2-3 PM.
- doc.add(new LongField("modified", file.lastModified(),Field.Store.YES));
- // Add the contents of the file to a field named "contents".
- // Specify a Reader,
- // so that the text of the file is tokenized and indexed,
- // but not stored.
- // Note that FileReader expects the file to be in UTF-8
- // encoding.
- // If that's not the case searching for special characters
- // will fail.
- //以文件的内容建立索引域(Field)
- doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));
- if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
- // New index, so we just add the document (no old
- // document can be there):
- System.out.println("adding " + file);
- writer.addDocument(doc);//将文档写入到索引中(以创建的方式)
- } else {
- // Existing index (an old copy of this document may have
- // been indexed) so
- // we use updateDocument instead to replace the old one
- // matching the exact
- // path, if present:
- System.out.println("updating " + file);
- writer.updateDocument(new Term("path", file.getPath()),doc);//以追加方式写入到索引中
- }
- } finally {
- fis.close();
- }
- }
- }
- }
- /**
- * 搜索
- * http://blog.csdn.net/nuptboyzhb
- */
- public void searcher(String indexPath,String searchKeyword){
- try {
- IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
- IndexSearcher searcher = new IndexSearcher(reader);
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
- String field = "contents";//搜索域是:文档的内容
- QueryParser parser = new QueryParser(Version.LUCENE_44, field, analyzer);
- Query query= parser.parse(searchKeyword);//搜索内容中含有searchKeyword字符串的文档
- TopDocs tds=searcher.search(query, 10);//搜索前十个
- ScoreDoc[] sds= tds.scoreDocs;
- for (ScoreDoc sd:sds) {//将内容中含有“南京”关键字的文档遍历一遍
- Document document=searcher.doc(sd.doc);
- System.out.println("score:"+sd.score+"--filename:"+document.get("filename")+
- "--path:"+document.get("path")+"--time"+document.get("modified"));//打印检索结果中文档的路径
- }
- reader.close();
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }catch (ParseException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- /**
- * 删除索引
- * @param indexPath 索引所在的路径
- * @param deleteKeyword 删除含有该内容的索引
- */
- public void deleteIndexByQuery(String indexPath,String deleteKeyword){
- try {
- //1.新建一个IndexWrite
- IndexWriter writer = new IndexWriter(FSDirectory.open(new File(indexPath)),new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));
- //2.生成一个Query
- Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
- String field = "contents";//搜索域是:文档的内容
- QueryParser parser = new QueryParser(Version.LUCENE_44, field, analyzer);
- Query query= parser.parse(deleteKeyword);//生成搜索内容中含有deleteKeyword的文档
- //3.按Query参数的方式删除索引,即删除了含有deleteKeyword的索引
- writer.deleteDocuments(query);
- writer.commit();//提交,正是删除
- writer.close();//关闭
- //
- //writer.deleteDocuments(new Term(field, ""));
- }catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }catch (ParseException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- }
- }
4.编写Junit测试类
- package com.njupt.zhb;
- import org.junit.Test;
- /*
- *@author: ZhengHaibo
- *web: http://blog.csdn.net/nuptboyzhb
- *mail: zhb931706659@126.com
- *2013-08-25 Nanjing,njupt,China
- */
- public class TestJunit {
- @Test
- public void TestIndex(){
- HelloLucene hLucene=new HelloLucene();
- hLucene.index("index", "D:\\lucene");
- }
- @Test
- public void TestSearcher(){
- HelloLucene hLucene=new HelloLucene();
- hLucene.searcher("index","南京");
- }
- @Test
- public void TestDeleteIndexByQuery(){
- HelloLucene hLucene=new HelloLucene();
- System.out.println("未删除前,查询关键字:北京 --结果:");
- hLucene.searcher("index","北京");
- hLucene.deleteIndexByQuery("index", "北京");
- System.out.println("删除后,查询关键字:北京 --结果:");
- hLucene.searcher("index","北京");
- }
- }
5.实验结果
5.1运行TestIndex方法
>控制台打印的信息
- updating D:\lucene\lucene1.txt
- updating D:\lucene\lucene2.txt
- updating D:\lucene\lucene3.txt
- updating D:\lucene\北京.txt
- updating D:\lucene\南京.txt
此时的index目录下的截图:
5.2运行TestSearcher方法
>搜索含有关键字“南京”的文档
- score:0.53033006--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
- score:0.48666292--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
- score:0.2155931--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795
- score:0.1530931--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486
5.3运行TestDeleteIndexByQuery方法
>
- 未删除前,查询关键字:北京 --结果:
- score:0.4847152--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
- score:0.39226472--filename:北京.txt--path:D:\lucene\北京.txt--time1377784223795
- score:0.10348864--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
- score:0.029874597--filename:南京.txt--path:D:\lucene\南京.txt--time1377784261486
- 删除后,查询关键字:北京 --结果:
此时,index目录下的文件结构为:
多出了一个_0_1.del文件(该文件记录了被标记为已删除的文档;这些文档要到段合并时才会从索引中真正清除)。
项目源代码:http://download.csdn.net/detail/nuptboyzhb/6041239
未经允许,不得用于商业目的