中文分词
ChineseParse.cs
using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;
namespace FullTextSearch.Common
{
/// <summary>
/// 中文分词器。
/// </summary>
public class ChineseParse
{
private static readonly ChineseWordsHashCountSet _countTable;
static ChineseParse()
{
_countTable = new ChineseWordsHashCountSet();
InitFromFile("ChineseDictionary.txt");
}
/// <summary>
/// 从指定的文件中初始化中文词语字典和字符串次数字典。
/// </summary>
/// <param name="fileName">文件名</param>
private static void InitFromFile(string fileName)
{
string path = Path.Combine(Directory.GetCurrentDirectory(), @"..\..\Common\", fileName);
if (File.Exists(path))
{
using (StreamReader sr = File.OpenText(path))
{
string s = "";
while ((s = sr.ReadLine()) != null)
{
ChineseWordUnit _tempUnit = InitUnit(s);
_countTable.InsertWord(_tempUnit.Word);
}
}
}
}
/// <summary>
/// 将一个字符串解析为ChineseWordUnit。
/// </summary>
/// <param name="s">字符串</param>
/// <returns>解析得到的ChineseWordUnit</returns>
/// 4
/// 0
private static ChineseWordUnit InitUnit(string s)
{
var reg = new Regex(@"\s+");
string[] temp = reg.Split(s);
//if (temp.Length != 2)
//{
// throw new Exception("字符串解析错误:" + s);
//}
if (temp.Length != 1)
{
throw new Exception("字符串解析错误:" + s);
}
return new ChineseWordUnit(temp[0], Int32.Parse("1"));
}
/// <summary>
/// 分析输入的字符串,将其切割成一个个的词语。
/// </summary>
/// <param name="s">待切割的字符串</param>
/// <returns>所切割得到的中文词语数组</returns>
public static string[] ParseChinese(string s)
{
int _length = s.Length;
string _temp = String.Empty;
var _words = new ArrayList();
for (int i = 0; i < s.Length;)
{
_temp = s.Substring(i, 1);
if (_countTable.GetCount(_temp) > 1)
{
int j = 2;
for (; i + j < s.Length + 1 && _countTable.GetCount(s.Substring(i, j)) > 0; j++)
{
}
_temp = s.Substring(i, j - 1);
i = i + j - 2;
}
i++;
_words.Add(_temp);
}
var _tempStringArray = new string[_words.Count];
_words.CopyTo(_tempStringArray);
return _tempStringArray;
}
}
}
ChineseWordsHashCountSet.cs
using System.Collections;
namespace FullTextSearch.Common
{
/// <summary>
/// 记录字符串出现在中文字典所录中文词语的前端的次数的字典类。如字符串"中"出现在"中国"的前端,则在字典中记录一个次数。
/// </summary>
public class ChineseWordsHashCountSet
{
/// <summary>
/// 记录字符串在中文词语中出现次数的Hashtable。键为特定的字符串,值为该字符串在中文词语中出现的次数。
/// </summary>
private readonly Hashtable _rootTable;
/// <summary>
/// 类型初始化。
/// </summary>
public ChineseWordsHashCountSet()
{
_rootTable = new Hashtable();
}
/// <summary>
/// 查询指定字符串出现在中文字典所录中文词语的前端的次数。
/// </summary>
/// <param name="s">指定字符串</param>
/// <returns>字符串出现在中文字典所录中文词语的前端的次数。若为-1,表示不出现。</returns>
public int GetCount(string s)
{
if (!_rootTable.ContainsKey(s.Length))
{
return -1;
}
var _tempTable = (Hashtable) _rootTable[s.Length];
if (!_tempTable.ContainsKey(s))
{
return -1;
}
return (int) _tempTable[s];
}
/// <summary>
/// 向次数字典中插入一个词语。解析该词语,插入次数字典。
/// </summary>
/// <param name="s">所处理的字符串。</param>
public void InsertWord(string s)
{
for (int i = 0; i < s.Length; i++)
{
string _s = s.Substring(0, i + 1);
InsertSubString(_s);
}
}
/// <summary>
/// 向次数字典中插入一个字符串的次数记录。
/// </summary>
/// <param name="s">所插入的字符串。</param>
private void InsertSubString(string s)
{
if (!_rootTable.ContainsKey(s.Length) && s.Length > 0)
{
var _newHashtable = new Hashtable();
_rootTable.Add(s.Length, _newHashtable);
}
var _tempTable = (Hashtable) _rootTable[s.Length];
if (!_tempTable.ContainsKey(s))
{
_tempTable.Add(s, 1);
}
else
{
_tempTable[s] = (int) _tempTable[s] + 1;
}
}
}
}
ChineseWordUnit.cs
namespace FullTextSearch.Common
{
public struct ChineseWordUnit
{
private readonly int _power;
private readonly string _word;
/// <summary>
/// 结构初始化。
/// </summary>
/// <param name="word">中文词语</param>
/// <param name="power">该词语的权重</param>
public ChineseWordUnit(string word, int power)
{
_word = word;
_power = power;
}
/// <summary>
/// 中文词语单元所对应的中文词。
/// </summary>
public string Word
{
get { return _word; }
}
/// <summary>
/// 该中文词语的权重。
/// </summary>
public int Power
{
get { return _power; }
}
}
}
ChineseDictionary.txt
主窗体界面
MainManager.cs
using System;
using System.Collections.Generic;
using System.Data;
using System.Drawing;
using System.Windows.Forms;
using FullTextSearch.Common;
using Npgsql;
namespace FullTextSearch
{
public partial class MainManager : Form
{
private readonly PostgreSQL pg = new PostgreSQL();
private readonly SQLquerys sqlQuerys = new SQLquerys();
private char analysisType;
private string createConnString = "";
private DataSet dataSet = new DataSet();
private DataTable dataTable = new DataTable();
private char odabirAndOr;
private char vrstaPretrazivanja;
public MainManager()
{
InitializeComponent();
rbtn_AND.Checked = true;
rbtnNeizmjenjeni.Checked = true;
odabirAndOr = '*';
radioButton_Day.Checked = true;
radioButton_Day.Checked = true;
}
private void Form1_Load(object sender, EventArgs e)
{
gb_unosPodataka.Enabled = false;
groupBox_Search.Enabled = false;
groupBox_Analysis.Enabled = false;
button_Disconnect.Enabled = false;
button_Pretrazi.BackColor = Color.WhiteSmoke;
button_Disconnect.BackColor = Color.WhiteSmoke;
button_unosTekstaUBazu.BackColor = Color.WhiteSmoke;
button1.BackColor = Color.WhiteSmoke;
}
private void button_unosTekstaUBazu_Click(object sender, EventArgs e)
{
string searchTextBoxString = rTB_unosTextaUBazu.Text;
if (searchTextBoxString != "")
{
pg.insertIntoTable(searchTextBoxString, pg.conn);
MessageBox.Show(searchTextBoxString + " 添加到数据库!");
rTB_unosTextaUBazu.Clear();
}
else
{
MessageBox.Show("不允许空数据!");
}
}
private void button_Pretrazi_Click(object sender, EventArgs e)
{
string stringToSearch;
string sql;
string highlitedText;
string rank;
string check;
stringToSearch = txt_Search.Text.Trim();
var list = new List<string>(ChineseParse.ParseChinese(stringToSearch));
;
sql = sqlQuerys.createSqlString(list, odabirAndOr, vrstaPretrazivanja);
richTextBox1.Text = sql;
check = sqlQuerys.testIfEmpty(stringToSearch);
pg.insertIntoAnalysisTable(stringToSearch, pg.conn);
pg.openConnection();
var command = new NpgsqlCommand(sql, pg.conn);
NpgsqlDataReader reader = command.ExecuteReader();
int count = 0;
linkLabel_Rezultat.Text = " ";
while (reader.Read())
{
highlitedText = reader[1].ToString();
rank = reader[3].ToString();
linkLabel_Rezultat.Text += highlitedText + "[" + rank + "]\n";
count++;
}
labelBrojac.Text = "找到的文件数量: " + count;
pg.closeConnection();
}
private void rbtn_AND_CheckedChanged(object sender, EventArgs e)
{
odabirAndOr = '*';
}
private void rbtn_OR_CheckedChanged(object sender, EventArgs e)
{
odabirAndOr = '+';
}
private void rbtnNeizmjenjeni_CheckedChanged(object sender, EventArgs e)
{
vrstaPretrazivanja = 'A';
}
private void rbtn_Rijecnici_CheckedChanged(object sender, EventArgs e)
{
vrstaPretrazivanja = 'B';
}
private void rbtn_Fuzzy_CheckedChanged(object sender, EventArgs e)
{
vrstaPretrazivanja = 'C';
}
private void button_Connect_Click(object sender, EventArgs e)
{
if (connectMe())
{
gb_unosPodataka.Enabled = true;
groupBox_Search.Enabled = true;
groupBox_Analysis.Enabled = true;
textBox_Database.Enabled = false;
textBox_IP.Enabled = false;
textBox_Port.Enabled = false;
textBox_Password.Enabled = false;
textBox_UserID.Enabled = false;
button_Connect.Enabled = false;
button_Disconnect.Enabled = true;
button_Pretrazi.BackColor = Color.SkyBlue;
button_Disconnect.BackColor = Color.IndianRed;
button_unosTekstaUBazu.BackColor = Color.MediumSeaGreen;
button1.BackColor = Color.MediumSeaGreen;
button_Connect.BackColor = Color.WhiteSmoke;
}
}
private void button_Disconnect_Click(object sender, EventArgs e)
{
gb_unosPodataka.Enabled = false;
groupBox_Search.Enabled = false;
groupBox_Analysis.Enabled = false;
textBox_Database.Enabled = true;
textBox_IP.Enabled = true;
textBox_Port.Enabled = true;
textBox_Password.Enabled = true;
textBox_UserID.Enabled = true;
button_Connect.Enabled = true;
button_Disconnect.Enabled = false;
button_Pretrazi.BackColor = Color.WhiteSmoke;
button_Disconnect.BackColor = Color.WhiteSmoke;
button_unosTekstaUBazu.BackColor = Color.WhiteSmoke;
button1.BackColor = Color.WhiteSmoke;
button_Connect.BackColor = Color.MediumSeaGreen;
txt_Search.Text = "";
linkLabel_Rezultat.Text = "";
richTextBox1.Text = "";
labelBrojac.Text = "";
}
private bool connectMe()
{
createConnString += "Server=" + textBox_IP.Text + ";Port=" + textBox_Port.Text + ";User Id=" +
textBox_UserID.Text + ";Password=" + textBox_Password.Text + ";Database=" +
textBox_Database.Text + ";";
sqlQuerys.setTheKey(createConnString);
pg.setConnectionString();
pg.setConnection();
if (pg.openConnection())
{
MessageBox.Show("您已成功连接!");
pg.closeConnection();
return true;
}
return false;
}
private void button1_Click(object sender, EventArgs e)
{
string selectedTimestamp;
selectedTimestamp = dateTimePicker_From.Value.ToString("dd-MM-yyyy hh:mm:ss") + " " +
dateTimePicker_To.Value.ToString("dd-MM-yyyy hh:mm:ss");
var analize = new Analysis(selectedTimestamp, analysisType);
analize.Show();
}
private void radioButton_Day_CheckedChanged(object sender, EventArgs e)
{
analysisType = 'D';
}
private void radioButton_Hour_CheckedChanged(object sender, EventArgs e)
{
analysisType = 'H';
}
}
}
SQLquerys.cs代码:
using System.Collections.Generic;
namespace FullTextSearch
{
internal class SQLquerys
{
private static string giveMeTheKey;
private static int tempInt = 1;
//设置连接字符串
public void setTheKey(string connString)
{
giveMeTheKey = connString;
giveMeTheKey += "";
}
//将连接字符串存储在静态变量中
public string getTheKey()
{
giveMeTheKey += "";
return giveMeTheKey;
}
public void setCounter()
{
tempInt = 1;
}
//根据AND和OR的选择分析字符串进行搜索
public string createFunctionString(List<string> searchList, char selector)
{
string TempString = "";
string[] TempField = null;
int i = 0;
int j = 0;
foreach (string searchStringInList in searchList)
{
if (j != 0)
{
if (selector == '+')
TempString = TempString + " | ";
else if (selector == '*')
TempString = TempString + " & ";
}
j = 1;
TempField = splitListForInput(searchStringInList);
TempString = TempString + "(";
foreach (string justTempString in TempField)
{
if (i != 0)
{
TempString = TempString + " & ";
}
TempString = TempString + justTempString;
i = 1;
}
TempString = TempString + ")";
i = 0;
}
return TempString;
}
//帮助方法
public List<string> splitInputField(string[] inputField)
{
var unfinishedList = new List<string>();
foreach (string splitString in inputField)
{
unfinishedList.Add(splitString);
}
return unfinishedList;
}
//帮助方法
public string[] splitListForInput(string inputString)
{
string[] parsedList = null;
parsedList = inputString.Split(' ');
return parsedList;
}
//在PostgreSQL中创建ts功能的功能,用于字典搜索
public string createTsFunction(string tsString)
{
string tsHeadline = "";
string tsRank = "";
string tsFunction = "";
tsHeadline = ",\n ts_headline(\"content\", to_tsquery('" + tsString + "')), \"content\"";
tsRank = ",\n ts_rank(to_tsvector(\"content\"), to_tsquery('" + tsString + "')) rank";
tsFunction = tsHeadline + tsRank;
return tsFunction;
}
//创建SQL查询依赖于选择哪种类型的搜索,也取决于AND或OR选择器
public string createSqlString(List<string> searchList, char selector, char vrstaPretrazivanja)
{
string selectString = "";
string myTempString = "";
string TempString = "";
int i = 0;
TempString = createFunctionString(searchList, selector);
TempString = createTsFunction(TempString);
selectString = "SELECT \"id\"" + TempString + "\nFROM \"texttable\" \nWHERE ";
if (vrstaPretrazivanja == 'A')
{
foreach (string myString in searchList)
{
if (i == 0)
{
myTempString = myTempString + "\"content\" LIKE '%" + myString + "%' ";
i++;
}
else
{
if (selector == '*')
myTempString = myTempString + "\nAND \"content\" LIKE '%" + myString + "%' ";
else if (selector == '+')
myTempString = myTempString + "\nOR \"content\" LIKE '%" + myString + "%' ";
}
}
}
else if (vrstaPretrazivanja == 'B')
{
foreach (string myString in searchList)
{
string temporalString = "";
string[] testingString = myString.Split(' ');
for (int k = 0; k < testingString.Length; k++)
{
if (k != testingString.Length - 1)
{
temporalString += testingString[k] + " & ";
}
else
{
temporalString += testingString[k];
}
}
if (i == 0)
{
myTempString = myTempString + "to_tsvector(\"content\") @@ to_tsquery('english', '" +
temporalString + "')";
i++;
}
else
{
if (selector == '*')
myTempString = myTempString + "\nAND to_tsvector(\"content\") @@ to_tsquery('english', '" +
temporalString + "')";
else if (selector == '+')
myTempString = myTempString + "\nOR to_tsvector(\"content\") @@ to_tsquery('english', '" +
temporalString + "')";
}
}
}
if (vrstaPretrazivanja == 'C')
{
foreach (string myString in searchList)
{
if (i == 0)
{
myTempString = myTempString + "\"content\" % '" + myString + "' ";
i++;
}
else
{
if (selector == '*')
myTempString = myTempString + "\nAND \"content\" % '" + myString + "' ";
else if (selector == '+')
myTempString = myTempString + "\nOR \"content\" % '" + myString + "' ";
}
}
}
selectString = selectString + myTempString + "\nORDER BY rank DESC";
return selectString;
}
public string testIfEmpty(string searchedText)
{
string checkingIfEmpty = "SELECT * FROM \"analysisTable\" WHERE \"searchedtext\" =' " + searchedText + "'";
return checkingIfEmpty;
}
public string queryForAnalysis(char analysisChoice)
{
string myTestsql = "";
if (analysisChoice == 'H')
{
//这个查询是这样写的只是为了测试的目的,它需要改变
myTestsql = "SELECT * FROM crosstab('SELECT CAST((\"searchedtext\") AS text) searchedText,"
+
" CAST(EXTRACT(HOUR FROM \"timeOfSearch\") AS int) AS sat, CAST(COUNT(*) AS int) AS broj FROM \"analysisTable\" GROUP BY \"searchedText\", sat"
+
" ORDER BY \"searchedtext\", sat', 'SELECT rbrSata FROM sat ORDER BY rbrSata') AS pivotTable (\"searchedText\" TEXT, t0_1 INT, t1_2 INT"
+
", t2_3 INT, t3_4 INT, t4_5 INT, t5_6 INT, t6_7 INT, t7_8 INT, t8_9 INT, t9_10 INT, t10_11 INT, t11_12 INT, t12_13 INT"
+
", t13_14 INT, t14_15 INT, t15_16 INT, t16_17 INT, t17_18 INT, t18_19 INT, t19_20 INT, t20_21 INT, t21_22 INT, t22_23 INT, t23_00 INT) ORDER BY \"searchedText\"";
return myTestsql;
}
if (analysisChoice == 'D')
{
//这个查询是这样写的只是为了测试的目的,它需要改变
myTestsql += "SELECT *FROM crosstab ('SELECT CAST((\"searchedtext\") AS text) AS searchedText, CAST(EXTRACT(DAY FROM \"dateOfSearch\") AS int) AS dan"
+ ", CAST(COUNT(*) AS int) AS broj FROM \"analysisTable\" GROUP BY \"searchedText\", "
+
"dan ORDER BY \"searchedtext\", dan', 'SELECT rbrDana FROM dan ORDER BY rbrDana') AS pivotTable(\"searchedtext\" TEXT";
return myTestsql;
}
return myTestsql;
}
//此方法用于解析日期
public int[] parseForDates(string date)
{
string[] temp;
var tempInt = new int[3];
temp = date.Split('-');
for (int i = 0; i < 3; i++)
{
tempInt[i] = int.Parse(temp[i]);
}
return tempInt;
}
//此代码用于创建分析,它执行一些日期/时间操作,以便能够为选定的日期/时间创建分析。
public string createSqlForDayAnalysis(string dateFrom, string dateTo)
{
string insertIntoTempTable = "";
string dateTimeForAnalysis = "";
int[] tempFrom = parseForDates(dateFrom);
int[] tempTo = parseForDates(dateTo);
//月份变更算法
while (tempFrom[0] != tempTo[0] || tempFrom[1] != tempTo[1])
{
if (tempFrom[1] == tempTo[1])
{
if (tempFrom[0] != tempTo[0])
{
for (int i = tempInt + 1; tempFrom[0] + 2 < tempTo[0] + 2; i++)
{
insertIntoTempTable += "INSERT INTO \"dan\" VALUES (" + i + ");";
dateTimeForAnalysis += ",dd" + tempFrom[0] + tempFrom[1] + tempFrom[2] + " INT";
tempInt = i;
tempFrom[0]++;
}
}
}
if (tempFrom[1] != tempTo[1])
{
if (tempFrom[1]%2 == 0 || tempFrom[1] == 7 || tempFrom[1] == 1)
{
for (int i = tempInt; tempFrom[0] < 31 && tempFrom[1] != tempTo[1]; i++)
{
insertIntoTempTable += "INSERT INTO \"dan\" VALUES (" + i + ");";
dateTimeForAnalysis += ", dd" + tempFrom[0] + tempFrom[1] + tempFrom[2] + " INT";
tempInt = i;
tempFrom[0]++;
if (tempFrom[0] == 31)
{
tempFrom[1]++;
tempFrom[0] = 1;
}
}
}
}
}
dateTimeForAnalysis += ") ORDER BY \"searchedtext\"";
return dateTimeForAnalysis + "#" + insertIntoTempTable;
}
}
}
PostgreSQL.cs代码:
using System;
using System.Windows.Forms;
using Npgsql;
using NpgsqlTypes;
namespace FullTextSearch
{
public class PostgreSQL
{
private static int tempInt = 1;
private readonly SQLquerys sql = new SQLquerys();
public NpgsqlConnection conn;
public string connectionstring;
private string newConnString;
public PostgreSQL()
{
setConnectionString();
setConnection();
}
public void setConnectionString()
{
newConnString = sql.getTheKey();
connectionstring = String.Format(newConnString);
setConnection();
}
public void setConnection()
{
conn = new NpgsqlConnection(connectionstring);
}
public bool openConnection()
{
try
{
conn.Open();
return true;
}
catch
{
MessageBox.Show("Unable to connect! Check parameters!");
return false;
}
}
public void closeConnection()
{
conn.Close();
}
public void insertIntoTable(string textToInsert, NpgsqlConnection nsqlConn)
{
string mySqlString = "INSERT INTO \"texttable\" (\"content\") VALUES (@Param1)";
var myParameter = new NpgsqlParameter("@Param1", NpgsqlDbType.Text);
myParameter.Value = textToInsert;
openConnection();
var myCommand = new NpgsqlCommand(mySqlString, nsqlConn);
myCommand.Parameters.Add(myParameter);
myCommand.ExecuteNonQuery();
closeConnection();
}
public void insertIntoAnalysisTable(string textToInsert, NpgsqlConnection nsqlConn)
{
string dateTime = DateTime.Now.ToString();
string[] temp;
temp = dateTime.Split(' ');
string mySqlString =
"INSERT INTO \"analysistable\" (\"searchedtext\", \"dateofsearch\", \"timeofsearch\") VALUES ('" +
textToInsert + "', '" + temp[0] + "'" + ", '" + temp[1] + "');";
openConnection();
var myCommand = new NpgsqlCommand(mySqlString, nsqlConn);
myCommand.ExecuteNonQuery();
closeConnection();
}
public void executeQuery(string queryText, NpgsqlConnection nsqlConn)
{
openConnection();
var myCommand = new NpgsqlCommand(queryText, nsqlConn);
myCommand.ExecuteNonQuery();
closeConnection();
}
public void createTempTable(NpgsqlConnection nsqlConn, char analysisType, string dateFrom, string dateTo,
string splitMe)
{
if (analysisType == 'H')
{
string dropIfExists = "DROP TABLE IF EXISTS \"sat\";";
string createTempTable = "CREATE TABLE IF NOT EXISTS \"sat\" (rbrSata INT);";
string insertIntoTempTable = "";
for (int i = 0; i < 24; i++)
{
insertIntoTempTable += "INSERT INTO \"sat\" VALUES (" + i + ");";
}
openConnection();
var commandDrop = new NpgsqlCommand(dropIfExists, nsqlConn);
commandDrop.ExecuteNonQuery();
var commandCreate = new NpgsqlCommand(createTempTable, nsqlConn);
commandCreate.ExecuteNonQuery();
var commandInsert = new NpgsqlCommand(insertIntoTempTable, nsqlConn);
commandInsert.ExecuteNonQuery();
closeConnection();
}
else if (analysisType == 'D')
{
string dropIfExists = "DROP TABLE IF EXISTS \"dan\";";
string createTempTable = "CREATE TABLE IF NOT EXISTS \"dan\" (rbrDana INT);";
string insertIntoTempTable = splitMe;
openConnection();
var commandDrop = new NpgsqlCommand(dropIfExists, nsqlConn);
commandDrop.ExecuteNonQuery();
var commandCreate = new NpgsqlCommand(createTempTable, nsqlConn);
commandCreate.ExecuteNonQuery();
var commandInsert = new NpgsqlCommand(insertIntoTempTable, nsqlConn);
commandInsert.ExecuteNonQuery();
closeConnection();
}
}
}
}
PostGreSQL sql脚本:
CREATE TABLE public.analysistable
(
id integer NOT NULL DEFAULT nextval('analysistable_id_seq'::regclass),
searchedtext text COLLATE pg_catalog."default" NOT NULL,
dateofsearch date NOT NULL,
timeofsearch time without time zone NOT NULL,
CONSTRAINT analysistable_pkey PRIMARY KEY (id)
)
WITH (
OIDS = FALSE
)
TABLESPACE pg_default;
ALTER TABLE public.analysistable
OWNER to king;
CREATE TABLE public.texttable
(
id integer NOT NULL DEFAULT nextval('texttable_id_seq'::regclass),
content text COLLATE pg_catalog."default" NOT NULL,
CONSTRAINT texttable_pkey PRIMARY KEY (id)
)
WITH (
OIDS = FALSE
)
TABLESPACE pg_default;
ALTER TABLE public.texttable
OWNER to king;
运行结果如图: