Nutch集成Solr中文分词Schema

简介:

<?xml version="1.0" encoding="UTF-8"?>

<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
license agreements. See the NOTICE file distributed with this work for additional 
information regarding copyright ownership. The ASF licenses this file to 
You under the Apache License, Version 2.0 (the "License"); you may not use 
this file except in compliance with the License. You may obtain a copy of 
the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
by applicable law or agreed to in writing, software distributed under the 
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
OF ANY KIND, either express or implied. See the License for the specific 
language governing permissions and limitations under the License. -->
<!-- Description: This document contains the Solr 3.1 schema definition to be
used with the Solr integration currently built into Nutch. See
https://issues.apache.org/jira/browse/NUTCH-442
https://issues.apache.org/jira/browse/NUTCH-699
https://issues.apache.org/jira/browse/NUTCH-994
https://issues.apache.org/jira/browse/NUTCH-997 and
http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/schema.xml?view=markup
for more info. -->
<schema name="nutch" version="1.3">
<types>
  <!-- Primitive types: an untokenized string plus Trie-based long, float
       and date types. All of them omit norms. -->
  <fieldType name="string"
             class="solr.StrField"
             omitNorms="true"
             sortMissingLast="true"/>
  <fieldType name="long"
             class="solr.TrieLongField"
             omitNorms="true"
             precisionStep="0"
             positionIncrementGap="0"/>
  <fieldType name="float"
             class="solr.TrieFloatField"
             omitNorms="true"
             precisionStep="0"
             positionIncrementGap="0"/>
  <fieldType name="date"
             class="solr.TrieDateField"
             omitNorms="true"
             precisionStep="0"
             positionIncrementGap="0"/>

  <!-- Text type declared without any analyzer chain; the cache_content
       field uses it as a stored-only (non-indexed) field. -->
  <fieldType name="cache_text"
             class="solr.TextField"
             positionIncrementGap="100"/>

  <!-- Chinese text: mmseg4j tokenizer in "complex" mode reading its
       dictionaries from dicPath "dic", followed by lowercasing. The single
       analyzer below carries no type attribute, so the same chain is used
       for both indexing and querying. -->
  <fieldType name="text"
             class="solr.TextField"
             positionIncrementGap="100">
    <analyzer>
      <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory"
                 mode="complex"
                 dicPath="dic"/>
      <filter class="solr.LowerCaseFilterFactory"/>
    </analyzer>
  </fieldType>

  <!-- URL-like text: standard tokenizer, lowercasing, then the word
       delimiter filter generating word parts and number parts. -->
  <fieldType name="url"
             class="solr.TextField"
             positionIncrementGap="100">
    <analyzer>
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.WordDelimiterFilterFactory"
              generateWordParts="1"
              generateNumberParts="1"/>
    </analyzer>
  </fieldType>
</types>
<fields>
  <!-- Document identifier; referenced by the schema's uniqueKey. -->
  <field name="id" type="string" indexed="true" stored="true"/>

  <!-- Core fields: stored for retrieval only, not searchable. -->
  <field name="segment" type="string" indexed="false" stored="true"/>
  <field name="digest" type="string" indexed="false" stored="true"/>
  <field name="boost" type="float" indexed="false" stored="true"/>

  <!-- Fields for the index-basic plugin. -->
  <field name="host" type="url" indexed="true" stored="false"/>
  <field name="site" type="string" indexed="true" stored="false"/>
  <field name="url" type="url" indexed="true" stored="true" required="true"/>
  <!-- content is searchable through the "text" type but is not stored. -->
  <field name="content" type="text" indexed="true" stored="false"/>
  <field name="title" type="text" indexed="true" stored="true"/>
  <!-- cache and cache_content are stored only and never indexed. -->
  <field name="cache" type="string" indexed="false" stored="true"/>
  <field name="cache_content" type="cache_text" indexed="false" stored="true"/>
  <field name="tstamp" type="date" indexed="true" stored="true"/>

  <!-- Fields for the index-anchor plugin. -->
  <field name="anchor" type="string" indexed="true" stored="true" multiValued="true"/>

  <!-- Fields for the index-more plugin. -->
  <field name="type" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="contentLength" type="long" indexed="false" stored="true"/>
  <field name="lastModified" type="date" indexed="false" stored="true"/>
  <field name="date" type="date" indexed="true" stored="true"/>

  <!-- Fields for the languageidentifier plugin. -->
  <field name="lang" type="string" indexed="true" stored="true"/>

  <!-- Fields for the subcollection plugin. -->
  <field name="subcollection" type="string" indexed="true" stored="true" multiValued="true"/>

  <!-- Fields for the feed plugin (tag is also used by microformats-reltag). -->
  <field name="author" type="string" indexed="true" stored="true"/>
  <field name="tag" type="string" indexed="true" stored="true" multiValued="true"/>
  <field name="feed" type="string" indexed="true" stored="true"/>
  <field name="publishedDate" type="date" indexed="true" stored="true"/>
  <field name="updatedDate" type="date" indexed="true" stored="true"/>

  <!-- Fields for the creativecommons plugin. -->
  <field name="cc" type="string" indexed="true" stored="true" multiValued="true"/>
</fields>
<!-- Schema-level settings: "id" is the unique document key, queries that
     name no field search "content", and query terms are combined with OR
     by default. -->
<uniqueKey>id</uniqueKey>
<defaultSearchField>content</defaultSearchField>
<solrQueryParser defaultOperator="OR"/>
</schema>
 

本文转自william_xu 51CTO博客,原文链接:http://blog.51cto.com/williamx/773815,如需转载请自行联系原作者
相关文章
|
XML 存储 JSON
|
自然语言处理 Java 索引
全文检索Solr集成HanLP中文分词
以前发布过HanLP的Lucene插件,后来很多人跟我说其实Solr更流行(反正我是觉得既然Solr是Lucene的子项目,那么稍微改改配置就能支持Solr),于是就抽空做了个Solr插件出来,开源在Github上,欢迎改进。
3364 0
|
搜索推荐 Java 应用服务中间件
.net软件xcopy形式集成solr搜索引擎
Solr 是基于Lucene的开源企业搜索服务,提供了一个打包即用的解决方案[直接使用Lucene进行集成需要处理索引管理、分析器等一系列的问题,自己实现还是相对麻烦的]。Solr对外以HTTP协议提供服务,增加了很多的辅助功能,核心集成Lucene。
1020 0
|
2月前
|
监控 druid Java
spring boot 集成配置阿里 Druid监控配置
spring boot 集成配置阿里 Druid监控配置
186 6
|
2月前
|
Java 关系型数据库 MySQL
如何实现Springboot+camunda+mysql的集成
【7月更文挑战第2天】集成Spring Boot、Camunda和MySQL的简要步骤: 1. 初始化Spring Boot项目,添加Camunda和MySQL驱动依赖。 2. 配置`application.properties`,包括数据库URL、用户名和密码。 3. 设置Camunda引擎属性,指定数据源。 4. 引入流程定义文件(如`.bpmn`)。 5. 创建服务处理流程操作,创建控制器接收请求。 6. Camunda自动在数据库创建表结构。 7. 启动应用,测试流程启动,如通过服务和控制器开始流程实例。 示例代码包括服务类启动流程实例及控制器接口。实际集成需按业务需求调整。
208 4
|
2月前
|
消息中间件 Java 测试技术
【RocketMQ系列八】SpringBoot集成RocketMQ-实现普通消息和事务消息
【RocketMQ系列八】SpringBoot集成RocketMQ-实现普通消息和事务消息
163 1
|
3月前
|
消息中间件 Java Kafka
springboot集成kafka
springboot集成kafka
124 2
|
2月前
|
消息中间件 Java Kafka
Spring Boot与Apache Kafka Streams的集成
Spring Boot与Apache Kafka Streams的集成