import os
import sys

import jieba
import pandas as pd


# Load stopwords from one or more files into a single set
def load_stopwords(filenames):
    stopwords = set()
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                stopwords.add(line.strip())
    return stopwords


# Segment Chinese text with jieba, dropping stopwords and single-character tokens
def segment_and_remove_stopwords(text, stopwords):
    words = jieba.cut(text)
    filtered_words = [word for word in words if word not in stopwords and len(word) > 1]
    return ' '.join(filtered_words)


# Apply segmentation and stopword removal to every comment in the DataFrame
def process_comments(df, comment_column, stopwords):
    # fillna('') and str() guard against NaN cells, which would crash jieba.cut
    df['connected_words'] = df[comment_column].fillna('').apply(
        lambda x: segment_and_remove_stopwords(str(x), stopwords))
    return df


# Main entry point
def main(input_file_path, output_file_path, comment_column, stopwords_files=None):
    # Load the stopwords (None default avoids a mutable default argument)
    stopwords = load_stopwords(stopwords_files or [])
    # Read the CSV file
    df = pd.read_csv(input_file_path, encoding='utf-8')
    # Process the comment column
    processed_df = process_comments(df, comment_column, stopwords)
    # Save the processed data to a new CSV file
    # ('utf-8-sig' adds a BOM so Excel displays the Chinese text correctly)
    processed_df.to_csv(output_file_path, index=False, encoding='utf-8-sig')
    print(f"Preprocessing finished; results saved to {output_file_path}")


if __name__ == '__main__':
    input_file_path = r"D:\pycharm\爬虫案列\24.汽车之家\_0_10.csv"  # path to your CSV file
    output_file_path = 'comments_processed.csv'  # path for the output file
    comment_column = '空间'  # column holding the comment text (here the '空间' review field)
    # Stopword file list; make sure these files exist in your working directory
    stopwords_files = [
        r"stopwords-master\baidu_stopwords.txt",
        r"stopwords-master\cn_stopwords.txt",
        r"stopwords-master\hit_stopwords.txt",
        r"stopwords-master\scu_stopwords.txt",
        # ... other stopword files
    ]
    # Abort early if any stopword file is missing
    for filename in stopwords_files:
        if not os.path.exists(filename):
            print(f"Stopwords file {filename} not found.")
            sys.exit(1)
    # Run the pipeline on the comment data
    main(input_file_path, output_file_path, comment_column, stopwords_files)
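As a quick sanity check of the filtering logic, the snippet below runs jieba on a made-up comment with a tiny hand-picked stopword set. The sample sentence and stopwords here are illustrative assumptions only, and the exact tokens depend on jieba's dictionary:

import jieba

# Hypothetical sample comment and a tiny stopword set, for illustration only
sample = "这款车的空间非常大，后排坐三个成年人也不觉得拥挤。"
stopwords = {"的", "也", "非常", "觉得"}

# Keep only multi-character tokens that are not stopwords,
# mirroring the len(word) > 1 filter in segment_and_remove_stopwords
tokens = [w for w in jieba.cut(sample) if w not in stopwords and len(w) > 1]
print(' '.join(tokens))

Single-character tokens are dropped along with the stopwords, which removes most function words but will also discard meaningful one-character words, so adjust the length filter if that matters for your analysis.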
The stopword lists are available in the blogger's uploaded resources, free to download.
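If you would rather not list every file by hand, you can collect all the .txt lists from the folder with glob instead; a minimal sketch, assuming the same stopwords-master directory layout used in the script above:

import glob

# Pick up every stopword list in the directory
# (assumes the stopwords-master folder sits in the working directory)
stopwords_files = glob.glob(r"stopwords-master\*.txt")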