手记实用系列文章:
3 自然语言处理手记
语料预处理封装类:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
#coding=utf-8
import os
import re
import sys
import time

import jieba
# The alias must sit on the same logical line as the import; split across
# lines it is a SyntaxError.
import jieba.posseg as pseg

# Add the parent directory to the module search path.
sys.path.append("../")
# Load the custom segmentation dictionary.
jieba.load_userdict("../Database/userdict.txt")
'''
title:利用结巴分词进行文本语料处理:单文本处理器、批量文件处理器
1 首先对文本进行遍历查找
2 创建原始文本的保存结构
3 对原文本进行结巴分词和停用词处理
4 对预处理结果进行标准化格式,并保存原文件结构路径
author:白宁超
myblog:http://www.cnblogs.com/baiboy/
'''
'''
分词、词性标注以及去停用词
stopwordspath: 停用词路径
dealpath:中文数据预处理文件的路径
savepath:中文数据预处理结果的保存路径
'''
def cutTxtWord(dealpath, savepath, stopwordspath):
    """Segment one text file, drop stop words, and pass the result to getFlag.

    Args:
        dealpath: Path of the UTF-8 text file to process.
        savepath: Output path, forwarded unchanged to ``getFlag``.
        stopwordspath: Path of a UTF-8 stop-word file, one entry per line.
    """
    # Read the stop words inside a context manager so the handle is closed
    # deterministically; a set is enough for the membership test below.
    with open(stopwordspath, "r", encoding='utf-8') as stop_file:
        stopwords = {line.rstrip() for line in stop_file}

    with open(dealpath, "r", encoding='utf-8') as f:
        text = f.read()

    # pseg.cut is iterated as (word, POS flag) pairs. Each kept pair is
    # rendered as "word/flag " and the pieces are joined once at the end
    # instead of growing a string with repeated "+=".
    tagged = [
        word + "/" + flag + " "
        for word, flag in pseg.cut(text)
        if word not in stopwords
    ]
    getFlag("".join(tagged), savepath)
'''
分词、词性标注以及去停用词
stopwordspath: 停用词路径
read_folder_path :中文数据预处理文件的路径
write_folder_path :中文数据预处理结果的保存路径
filescount=300 #设置文件夹下文件最多多少个
'''
def cutFileWord(read_folder_path, write_folder_path, stopwordspath):
    """Segment every file under each category folder and save the results.

    For each sub-folder of ``read_folder_path`` a folder of the same name is
    created under ``write_folder_path``; every file inside is segmented,
    stripped of stop words, and handed to ``getFlag`` together with the
    mirrored output path.

    Args:
        read_folder_path: Root directory whose sub-folders hold the UTF-8
            text files to process.
        write_folder_path: Root directory under which results are saved.
        stopwordspath: Path of a UTF-8 stop-word file, one entry per line.
    """
    # Read the stop words inside a context manager so the handle is closed.
    with open(stopwordspath, "r", encoding='utf-8') as stop_file:
        stopwords = {line.rstrip() for line in stop_file}

    # Loop over the category folders.
    for folder in os.listdir(read_folder_path):
        new_folder_path = os.path.join(read_folder_path, folder)
        # Stray files in the root are skipped; listing them would raise.
        if not os.path.isdir(new_folder_path):
            continue

        # Build the output folder with os.path.join so the directory that
        # is created is the same one the files are later written into,
        # whether or not write_folder_path ends with a separator.
        save_folder_path = os.path.join(write_folder_path, folder)
        if not os.path.exists(save_folder_path):
            os.makedirs(save_folder_path)
            print(save_folder_path + ' 创建成功')

        print('--> 请稍等,正在处理中...')

        # Loop over the files of one category.
        for file_name in os.listdir(new_folder_path):
            dealpath = os.path.join(new_folder_path, file_name)
            with open(dealpath, "r", encoding='utf-8') as f:
                text = f.read()

            # Keep "word/flag " for every segmented word that is not a
            # stop word, joining once instead of repeated "+=".
            tagged = [
                word + "/" + flag + " "
                for word, flag in pseg.cut(text)
                if word not in stopwords
            ]
            savepath = os.path.join(save_folder_path, file_name)
            getFlag("".join(tagged), savepath)
'''
做词性筛选
cutresult:str类型,初切分的结果
savepath: 保存文件路径
'''
def getFlag(cutresult, savepath):
    """Drop tokens with unwanted POS tags, strip the tags, and save the text.

    ``cutresult`` is a string of space-separated ``word/flag`` tokens, with
    newlines separating lines. Tokens whose tag is filtered are removed, the
    remaining tokens are reduced to ``word`` followed by a space, and the
    rebuilt text is passed to ``standdata``.

    Args:
        cutresult: Tagged segmentation result as a single string.
        savepath: Output path, forwarded unchanged to ``standdata``.
    """
    # Tag prefixes to filter out. The match is a prefix test on the tag, so
    # any tag that merely starts with one of these is dropped too.
    # NOTE(review): e.g. a tag "eng" is removed by the "e" prefix - confirm
    # this is intended.
    dropped_prefixes = ("x", "zg", "uj", "ul", "e", "d", "uz", "y")

    pieces = []
    for line in cutresult.split('\n'):
        for token in line.split(' '):
            # Split on the LAST slash: the tag is always the final segment,
            # so a word that itself contains "/" is neither truncated nor
            # mistaken for a filtered tag.
            word, sep, flag = token.rpartition('/')
            if not sep:
                # Untagged piece (the empty strings produced by splitting on
                # spaces) is kept verbatim.
                pieces.append(token)
            elif not flag.startswith(dropped_prefixes):
                pieces.append(word + " ")
        # Keep the original line structure.
        pieces.append("\n")

    standdata("".join(pieces), savepath)
'''
标准化处理,去除空行,空白字符等。
flagresult:筛选过的结果
'''
def standdata(flagresult, savepath):
    """Normalize the filtered text and write it to ``savepath`` as UTF-8.

    Lines shorter than two characters are dropped. Every other line is
    split on whitespace, its words are re-joined with ``"/ "``, and the
    line is written followed by a space and a newline.

    Args:
        flagresult: Filtered text, lines separated by ``"\\n"``.
        savepath: Path of the file to write; missing parent directories are
            created.
    """
    # Create the target directory if needed so a fresh output location does
    # not make the write fail.
    parent = os.path.dirname(savepath)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open(savepath, "w", encoding='utf-8') as out:
        for line in flagresult.split('\n'):
            # Skip empty and single-character lines.
            if len(line) >= 2:
                out.write("/ ".join(line.split()) + " " + "\n")
if __name__ == '__main__':
    # Script entry point: run the batch pre-processor and report the time.
    t1 = time.time()

    # Paths for a single-file test run.
    dealpath = "../Database/SogouC/FileTest/1.txt"
    savepath = "../Database/SogouCCut/FileTest/1.txt"

    # Stop-word lists; the second is the HIT (Harbin Institute of
    # Technology) list.
    stopwordspath = '../Database/stopwords/CH_stopWords.txt'
    stopwordspath1 = '../Database/stopwords/HG_stopWords.txt'

    # Root folder of the files to batch-process.
    # rfolder_path = '../Database/SogouC/Sample/'
    rfolder_path = '../Database/SogouC/FileNews/'
    # Root folder under which the segmented results are saved.
    wfolder_path = '../Database/SogouCCut/'

    # Chinese corpus pre-processor.
    # cutTxtWord(dealpath,savepath,stopwordspath) # single-text pre-processor
    cutFileWord(rfolder_path, wfolder_path, stopwordspath)  # multi-text pre-processor

    t2 = time.time()
    # Report the elapsed time.
    print(f"中文语料语处理完成,耗时:{t2 - t1}秒。")
|
执行结果:
转自:https://www.cnblogs.com/baiboy/p/7676251.html