在本节中,我将会介绍索引文件sph的生成.从上一节我们得知,sph文件保存了Sphinx的索引元信息以及一些索引相关的配置信息.
SPH文件生成
先来看代码,其中sph文件的生成是在CSphIndex_VLN::WriteHeader这个函数中:
// Serializes the index header to fdInfo, one field after another.
// The sequence of Put*/Save* calls below is the layout of the written header,
// so their order must not be changed.
// tBuildHeader supplies the build-time values (min docid, checkpoint and infix
// offsets, totals, kill-list size, min-max index); the schema, settings,
// tokenizer, dictionary and field filter are taken from the index object itself.
// Always returns true.
bool CSphIndex_VLN::WriteHeader ( const BuildHeader_t & tBuildHeader, CSphWriter & fdInfo ) const
{
// version
fdInfo.PutDword ( INDEX_MAGIC_HEADER );
fdInfo.PutDword ( INDEX_FORMAT_VERSION );
// bits
fdInfo.PutDword ( USE_64BIT );
// docinfo
fdInfo.PutDword ( m_tSettings.m_eDocinfo );
// schema
WriteSchema ( fdInfo, m_tSchema );
// min doc
fdInfo.PutOffset ( tBuildHeader.m_uMinDocid ); // was dword in v.1
// inline docinfo only: the min row follows, GetRowSize() rowitems long
if ( m_tSettings.m_eDocinfo==SPH_DOCINFO_INLINE )
fdInfo.PutBytes ( tBuildHeader.m_pMinRow, m_tSchema.GetRowSize()*sizeof(CSphRowitem) );
// wordlist checkpoints
fdInfo.PutOffset ( tBuildHeader.m_iDictCheckpointsOffset );
fdInfo.PutDword ( tBuildHeader.m_iDictCheckpoints );
// infix data; note that the infix blocks offset is narrowed to a DWORD here
fdInfo.PutByte ( tBuildHeader.m_iInfixCodepointBytes );
fdInfo.PutDword ( (DWORD)tBuildHeader.m_iInfixBlocksOffset );
fdInfo.PutDword ( tBuildHeader.m_iInfixBlocksWordsSize );
// index stats
fdInfo.PutDword ( (DWORD)tBuildHeader.m_iTotalDocuments ); // FIXME? we don't expect over 4G docs per just 1 local index
fdInfo.PutOffset ( tBuildHeader.m_iTotalBytes );
fdInfo.PutDword ( tBuildHeader.m_iTotalDups );
// index settings
SaveIndexSettings ( fdInfo, m_tSettings );
// tokenizer info
assert ( m_pTokenizer );
SaveTokenizerSettings ( fdInfo, m_pTokenizer, m_tSettings.m_iEmbeddedLimit );
// dictionary info
assert ( m_pDict );
SaveDictionarySettings ( fdInfo, m_pDict, false, m_tSettings.m_iEmbeddedLimit );
// kill-list size, then the min-max index value
fdInfo.PutDword ( tBuildHeader.m_uKillListSize );
fdInfo.PutOffset ( tBuildHeader.m_iMinMaxIndex );
// field filter info
SaveFieldFilterSettings ( fdInfo, m_pFieldFilter );
// average field lengths
// written only when m_bIndexFieldLens is set: one value per schema field
if ( m_tSettings.m_bIndexFieldLens )
ARRAY_FOREACH ( i, m_tSchema.m_dFields )
fdInfo.PutOffset ( m_dFieldLens[i] );
return true;
}
然后按顺序来解释下每一项字段的含义.
- 前两个字段INDEX_MAGIC_HEADER和INDEX_FORMAT_VERSION分别是magic number和索引版本号
- 第三个字段USE_64BIT表示是否使用64位的document和word id(默认是使用).
- 然后是写入docinfo,这个字段也就是配置中的docinfo字段(index block中)
- 接下来将会写入schema,也就是索引的schema信息,比如当前索引的字段名,当前需要建立的属性名等等.
// Writes tSchema to fdInfo: the number of fields followed by every field column,
// then the number of attributes followed by every attribute column.
// Each column is serialized by WriteSchemaColumn.
void WriteSchema ( CSphWriter & fdInfo, const CSphSchema & tSchema )
{
// schema
// fields: count first, then one column per field
fdInfo.PutDword ( tSchema.m_dFields.GetLength() );
ARRAY_FOREACH ( i, tSchema.m_dFields )
WriteSchemaColumn ( fdInfo, tSchema.m_dFields[i] );
// attributes: count first, then one column per attribute
fdInfo.PutDword ( tSchema.GetAttrsCount() );
for ( int i=0; i<tSchema.GetAttrsCount(); i++ )
WriteSchemaColumn ( fdInfo, tSchema.GetAttr(i) );
}
- 然后是写入当前索引集的最小doc id(m_uMinDocid)
- 接下来是根据docinfo(也就是属性存储)的配置来选择是否写入最小行(m_pMinRow)信息(当docinfo为inline的话,表示attribute value 将会存储在spd文件中).
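- 然后是写入wordlist的checkpoint信息(checkpoint的偏移和数量),以及infix相关的信息(m_iInfixCodepointBytes/m_iInfixBlocksOffset/m_iInfixBlocksWordsSize).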
- 然后是索引的统计信息(m_iTotalDocuments/m_iTotalBytes/m_iTotalDups).
- 接下来是写入对应的索引配置信息
// Writes the members of tSettings to tWriter in a fixed order.
// Boolean flags and small enums go out as single bytes, numeric settings as
// dwords and text settings as strings. The order of the calls is the layout.
void SaveIndexSettings ( CSphWriter & tWriter, const CSphIndexSettings & tSettings )
{
// prefix / infix / substring length limits
tWriter.PutDword ( tSettings.m_iMinPrefixLen );
tWriter.PutDword ( tSettings.m_iMinInfixLen );
tWriter.PutDword ( tSettings.m_iMaxSubstringLen );
// HTML stripping flag (normalized to 0/1) and its two string options
tWriter.PutByte ( tSettings.m_bHtmlStrip ? 1 : 0 );
tWriter.PutString ( tSettings.m_sHtmlIndexAttrs.cstr () );
tWriter.PutString ( tSettings.m_sHtmlRemoveElements.cstr () );
tWriter.PutByte ( tSettings.m_bIndexExactWords ? 1 : 0 );
// hitless mode and hit format
tWriter.PutDword ( tSettings.m_eHitless );
tWriter.PutDword ( tSettings.m_eHitFormat );
tWriter.PutByte ( tSettings.m_bIndexSP );
tWriter.PutString ( tSettings.m_sZones );
// position step settings
tWriter.PutDword ( tSettings.m_iBoundaryStep );
tWriter.PutDword ( tSettings.m_iStopwordStep );
tWriter.PutDword ( tSettings.m_iOvershortStep );
tWriter.PutDword ( tSettings.m_iEmbeddedLimit );
// bigram settings
tWriter.PutByte ( tSettings.m_eBigramIndex );
tWriter.PutString ( tSettings.m_sBigramWords );
// the same flag that WriteHeader checks before writing the field lengths
tWriter.PutByte ( tSettings.m_bIndexFieldLens );
// RLP settings and the index token filter
tWriter.PutByte ( tSettings.m_eChineseRLP );
tWriter.PutString ( tSettings.m_sRLPContext );
tWriter.PutString ( tSettings.m_sIndexTokenFilter );
}
- 写入对应的tokenizer的配置信息.
// Writes the settings of pTokenizer (must not be NULL) to tWriter.
// iEmbeddedLimit is the size threshold for embedding: the synonyms are written
// inline when the synonyms file size does not exceed it.
void SaveTokenizerSettings ( CSphWriter & tWriter, ISphTokenizer * pTokenizer, int iEmbeddedLimit )
{
assert ( pTokenizer );
const CSphTokenizerSettings & tSettings = pTokenizer->GetSettings ();
tWriter.PutByte ( tSettings.m_iType );
tWriter.PutString ( tSettings.m_sCaseFolding.cstr () );
tWriter.PutDword ( tSettings.m_iMinWordLen );
// the embed flag is written first; the synonyms themselves follow only when it is 1
bool bEmbedSynonyms = pTokenizer->GetSynFileInfo ().m_uSize<=(SphOffset_t)iEmbeddedLimit;
tWriter.PutByte ( bEmbedSynonyms ? 1 : 0 );
if ( bEmbedSynonyms )
pTokenizer->WriteSynonyms ( tWriter );
// the synonyms file name and file info are written whether or not the data was embedded
tWriter.PutString ( tSettings.m_sSynonymsFile.cstr () );
WriteFileInfo ( tWriter, pTokenizer->GetSynFileInfo () );
// remaining character-class and n-gram / blend settings
tWriter.PutString ( tSettings.m_sBoundary.cstr () );
tWriter.PutString ( tSettings.m_sIgnoreChars.cstr () );
tWriter.PutDword ( tSettings.m_iNgramLen );
tWriter.PutString ( tSettings.m_sNgramChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendChars.cstr () );
tWriter.PutString ( tSettings.m_sBlendMode.cstr () );
}
- 写入dictionary的配置信息(比如stop word之类).
// Writes the settings of pDict (must not be NULL) to tWriter.
// iEmbeddedLimit is the size threshold for embedding stopwords and wordforms.
// bForceWordDict forces the word-dict flag byte to be written as set.
// NOTE: this is an abridged excerpt; the dotted line stands for omitted code.
void SaveDictionarySettings ( CSphWriter & tWriter, CSphDict * pDict, bool bForceWordDict, int iEmbeddedLimit )
{
assert ( pDict );
const CSphDictSettings & tSettings = pDict->GetSettings ();
tWriter.PutString ( tSettings.m_sMorphology.cstr () );
// (omitted code; presumably declares dSWFileInfos and uTotalSize used below - confirm against the full source)
.............................
// stopwords: embed flag first, then the stopwords themselves only when the flag is 1
bool bEmbedStopwords = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
tWriter.PutByte ( bEmbedStopwords ? 1 : 0 );
if ( bEmbedStopwords )
pDict->WriteStopwords ( tWriter );
// the stopwords setting string and the per-file infos are written in either case
tWriter.PutString ( tSettings.m_sStopwords.cstr () );
tWriter.PutDword ( dSWFileInfos.GetLength () );
ARRAY_FOREACH ( i, dSWFileInfos )
{
tWriter.PutString ( dSWFileInfos[i].m_sFilename.cstr () );
WriteFileInfo ( tWriter, dSWFileInfos[i] );
}
// wordforms: sum the sizes of all wordforms files and compare against the limit
const CSphVector <CSphSavedFile> & dWFFileInfos = pDict->GetWordformsFileInfos ();
uTotalSize = 0;
ARRAY_FOREACH ( i, dWFFileInfos )
uTotalSize += dWFFileInfos[i].m_uSize;
bool bEmbedWordforms = uTotalSize<=(SphOffset_t)iEmbeddedLimit;
tWriter.PutByte ( bEmbedWordforms ? 1 : 0 );
if ( bEmbedWordforms )
pDict->WriteWordforms ( tWriter );
// file count, then name + file info for every wordforms file
tWriter.PutDword ( dWFFileInfos.GetLength() );
ARRAY_FOREACH ( i, dWFFileInfos )
{
tWriter.PutString ( dWFFileInfos[i].m_sFilename.cstr() );
WriteFileInfo ( tWriter, dWFFileInfos[i] );
}
tWriter.PutDword ( tSettings.m_iMinStemmingLen );
// word-dict flag is set if either the settings or the caller ask for it
tWriter.PutByte ( tSettings.m_bWordDict || bForceWordDict );
tWriter.PutByte ( tSettings.m_bStopwordsUnstemmed );
tWriter.PutString ( pDict->GetMorphDataFingerprint() );
}
- 然后是写入killlist的size(m_uKillListSize)
- 写入m_iMinMaxIndex.从下面的代码可以看到,它的值等于m_iDocinfo乘以每行的stride(DOCINFO_IDSIZE加上schema的行大小),也就是全部docinfo所占的大小.
CSphFixedVector<CSphRowitem> dMinRow ( tNewSchema.GetRowSize() );
...............
int iNewStride = DOCINFO_IDSIZE + tNewSchema.GetRowSize();
int64_t iNewMinMaxIndex = m_iDocinfo * iNewStride;
..............................
tBuildHeader.m_iMinMaxIndex = iNewMinMaxIndex;
- 写入regex相关配置(regexp_filter)
// Writes the field filter settings to tWriter.
// With no filter (pFieldFilter is NULL) only a zero regexp count is written.
// Otherwise: the regexp count, every regexp string, then a single byte set to 1.
void SaveFieldFilterSettings ( CSphWriter & tWriter, ISphFieldFilter * pFieldFilter )
{
if ( !pFieldFilter )
{
// no filter configured: a zero count is the whole record
tWriter.PutDword ( 0 );
return;
}
CSphFieldFilterSettings tSettings;
pFieldFilter->GetSettings ( tSettings );
tWriter.PutDword ( tSettings.m_dRegexps.GetLength() );
ARRAY_FOREACH ( i, tSettings.m_dRegexps )
tWriter.PutString ( tSettings.m_dRegexps[i] );
// written once after the loop, not per regexp
tWriter.PutByte(1); // deprecated utf8 flag
}
- 最后,如果开启了m_bIndexFieldLens,则依次写入每个schema field对应的长度(m_dFieldLens).