# -*- coding: utf-8 -*-
# import scrapy
from math import ceil
import re
import json
import time
import random
import hashlib
import requests
import urllib
import urllib.parse
from urllib import request
import pandas as pd
import enchant
from openpyxl import Workbook
from openpyxl import load_workbook

##
# def get_opencalais_results(text, api_key):
#     # Set the API URL and request headers
#     url = 'https://api.thomsonreuters.com/permid/calais'
#     headers = {'X-AG-Access-Token': api_key, 'Content-Type': 'text/raw', 'outputformat': 'application/json'}
#
#     # Send the request and read the response
#     response = requests.post(url, data=text.encode('utf-8'), headers=headers)
#     response_dict = json.loads(response.text)
#
#     # Extract the entities from the response
#     entities = {}
#     for key, value in response_dict.items():
#         if '_typeGroup' in value and value['_typeGroup'] == 'entities':
#             if 'name' in value and 'score' in value:
#                 entities.setdefault(value['_type'], []).append((value['name'], value['score']))
#
#     return entities
##
# # Call the OpenCalais API and process the result
# text = "Facebook is investing $5.7 billion in Reliance Jio, a move that will give the social media giant a 9.99% stake in the Indian telecom company."
# api_key = "your_api_key"
#
# entities = get_opencalais_results(text, api_key)
#
# # Print the results
# for entity_type, values in entities.items():
#     print(entity_type.upper())
#     print('-' * 20)
#     for value in values:
#         print(value[0], value[1])
#     print('\n')

# import spacy
#
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
#
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#           token.shape_, token.is_alpha, token.is_stop)


class YouDaoTranslator(object):

    def fanyi(self, key):
        """Translate the given text via Youdao's signed translate_o web endpoint."""
        print('--key-', key)
        # ----- Spoof a browser for the request
        # header = {
        #     "User-Agent": " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54"}
        # url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
        header = {
            # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # 'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            # 'Content-Length': '223',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'OUTFOX_SEARCH_USER_ID=-493176930@10.168.8.63; OUTFOX_SEARCH_USER_ID_NCOO=38624120.26076847; SESSION_FROM_COOKIE=unknown; JSESSIONID=aaabYcV4ZOU-JbQUha2uw; ___rl__test__cookies=1534210912076',
            'Host': 'fanyi.youdao.com',
            'Origin': 'http://fanyi.youdao.com',
            'Referer': 'http://fanyi.youdao.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }
        url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
        salt = int(time.time() * 1000 + random.randint(0, 10))
        salt_str = str(salt)
        D = "ebSeFb%=XZ%T[KZ)c(sy!"
        S = "fanyideskweb"
        sign_str = S + key + salt_str + D
        # Compute the md5 signature
        sign_md5_str = md5_jiami(sign_str)
        # ----- Build the form parameters
        formdata = {}
        formdata['i'] = key
        formdata['from'] = 'AUTO'
        formdata['to'] = 'AUTO'
        formdata['smartresult'] = 'dict'
        formdata['client'] = 'fanyideskweb'
        # formdata['salt'] = '15821157689747'
        # formdata['sign'] = 'd5a392995c28c285198043f7111d1d00'
        formdata['salt'] = salt_str
        formdata['sign'] = sign_md5_str
        formdata['ts'] = '1582115768974'
        formdata['bv'] = 'ec579abcd509567b8d56407a80835950'
        formdata['doctype'] = 'json'
        formdata['version'] = '2.1'
        formdata['keyfrom'] = 'fanyi.web'
        formdata['action'] = 'FY_BY_CLICKBUTTION'
        data = urllib.parse.urlencode(formdata).encode('utf-8')
        # ----- Send the request
        req = request.Request(url, data=data, headers=header)
        # ----- Parse the response
        resp = request.urlopen(req).read().decode()
        pat = r'"tgt":"(.*?)"}]]'
        result1 = re.findall(pat, resp)
        print('--result1-', result1)
        return result1[0]
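
    # For reference, a minimal sketch of the signing step used in fanyi() above.
    # It mirrors the captured web requests, not any official Youdao documentation,
    # and "hello world" is just an illustrative input:
    #
    #     salt = str(int(time.time() * 1000 + random.randint(0, 10)))
    #     sign = md5_jiami("fanyideskweb" + "hello world" + salt + "ebSeFb%=XZ%T[KZ)c(sy!")
    #
    # The resulting salt/sign pair is what ends up in formdata['salt'] and
    # formdata['sign'] (md5_jiami is defined further below).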

    def translator(self, text):
        """
        input : text  the string to translate
        output: translation  the translated string
        """
        # API endpoint (unsigned)
        url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
        # Request parameters; 'i' carries the text to translate
        key = {
            'type': "AUTO",
            'i': text,
            "doctype": "json",
            "version": "2.1",
            "keyfrom": "fanyi.web",
            "ue": "UTF-8",
            "action": "FY_BY_CLICKBUTTON",
            "typoResult": "true"
        }
        # The 'key' dict is the payload sent to the Youdao server
        response = requests.post(url, data=key)
        # Check whether the request succeeded
        if response.status_code == 200:
            # Load the returned body as JSON
            result = json.loads(response.text)
            # print("Input text: %s" % result['translateResult'][0][0]['src'])
            # print("Translation: %s" % result['translateResult'][0][0]['tgt'])
            translation = result['translateResult'][0][0]['tgt']
            return translation
        else:
            print("Youdao translation request failed")
            # Return None on failure
            return None

    def translatorAll(self, chineseList=None):
        if chineseList is None:
            chineseList = {}
        for chinese in chineseList:
            print(chinese.strip())
            translation = self.translator(chinese.strip())
            chineseList[chinese] = translation.lower().replace(' ', '_').replace('\"', '').replace('.', '').strip("_")
            print(translation.lower().replace(' ', '_').replace('\"', '').replace('.', '').strip("_"))


def md5_jiami(str_data):
    md5_obj = hashlib.md5()
    sign_bytes_data = str_data.encode('utf-8')
    # Feed the bytes into the md5 object
    md5_obj.update(sign_bytes_data)
    # Return the hex digest
    sign_str = md5_obj.hexdigest()
    return sign_str


def is_spelled_correctly(word, lang='en_US'):
    """Try to split a concatenated word into two dictionary words."""
    if word == 'nan':
        return ""
    if ' ' in word:
        return word
    dictionary = enchant.Dict(lang)
    for i in range(1, len(word)):
        first_word = word[:i]
        second_word = word[i:]
        if dictionary.check(first_word) and dictionary.check(second_word):
            print(word, first_word, second_word)
            return first_word + " " + second_word
    return word


def remove_common_prefix(row):
    # print('row[company]:', row['company'])
    # Split both fields into '|'-separated sequences
    seq1_list = str(row['company']).lower().split('|')
    seq2_list = str(row['brand1']).lower().split('|')
    # Compare character by character to find the longest common prefix
    new_seq2_list = []
    for seq1 in seq1_list:
        for seq2 in seq2_list:
            i = 0
            j = 0
            p = 0
            while i < len(seq1) and j < len(seq2):
                if seq1[i] == seq2[j]:
                    i += 1
                    j += 1
                    if i == len(seq1) - 1:
                        new_seq2_list.append(seq2[j + 1:].strip())
                        break
                else:
                    if p == len(seq1) - 1:
                        new_seq2_list.append(seq2.strip())
                        break
                    p += 1
                    i = p
                    j = 0
    return '|'.join(set(new_seq2_list))


if __name__ == '__main__':
    # Word segmentation test
    # is_spelled_correctly('controlmicrosystems')

    # Strip the company prefix
    # row = {}
    # row['company'] = ''
    # row['brand1'] = 'apache spamassassin'
    # result = remove_common_prefix(row)
    # print(result)

    # Merge attributes that share the same remark
    # import pandas as pd
    #
    # # Build a sample DataFrame
    # df = pd.DataFrame({
    #     'cev': ['cev1', 'cev2', 'cev3', 'cev4'],
    #     'remark': ['remarK1', 'remark1', 'remark2', 'remark2'],
    #     'company': ['com1', 'com2', 'com3', 'com4'],
    #     'brand': ['brand1', 'brand2', 'brand3', 'brand4'],
    #     'system': ['sys1', 'sys2', 'sys3', 'sys4']
    # })
    #
    # # Aggregation function that concatenates the other columns
    # def join_cols(s):
    #     return '|'.join(s)
    #
    # # Group df by 'remark' and concatenate the other columns
    # df['remark'] = df['remark'].map(str.lower)
    # grouped = df.groupby('remark').agg(join_cols)
    # grouped = grouped.reset_index()
    # print(grouped)

    # ############################################## Youdao translation
    # pattern = r'(\d+)\.([a-zA-Z]+)'
    # text = '** DISPUTED ** Directory traversal vulnerability in check_vote.php in Weekly Drawing Contest 0.0.1 allows remote attackers to read arbitrary files via a .. (dot dot) in the order parameter. NOTE: another researcher disputes this vulnerability, noting that the order variable is not used in any context that allows opening files.'
    # # Replace the '.' between digits and letters with '_'
    # text = re.sub(pattern, r'\1_\2', text)
    # print(text)
    #
    # mytranslator = YouDaoTranslator()
    # # result = mytranslator.translator(text)
    # result = mytranslator.fanyi(text)
    # print(result)
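
    # The commented-out pattern above rewrites a '.' that sits between digits and
    # letters as '_' before the text is sent to the translator, so such tokens are
    # not broken apart. A quick illustrative sketch (the sample string is made up,
    # not taken from the data set):
    #
    #     re.sub(r'(\d+)\.([a-zA-Z]+)', r'\1_\2', 'Tomcat 7.x before 7.0.10')
    #     # -> 'Tomcat 7_x before 7.0.10'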

    # ############################################## Youdao re-translation
    # pattern = r'(\d+)\.([a-zA-Z]+)'
    # mytranslator = YouDaoTranslator()
    #
    # # Translate the crawled data into Chinese through the API
    # exist_data = pd.read_excel("all_spyder(中文).xlsx")
    # exist_cves = list(exist_data['cveid'])
    #
    # data_output = pd.read_excel("all_spyder.xlsx")
    # ## De-duplicate
    # print(data_output.columns)
    # print(data_output.head())
    #
    # filename = "all_spyder(中文new).xlsx"
    #
    # # Iterate over each row of the dataframe
    # for index, row in data_output.iterrows():
    #     if index < 0: continue  # 20472
    #     # print(f"row: {row}")
    #     print(f"index: {index} {row['cveid']}")  # print the current row index
    #
    #     if row['cveid'] in exist_cves:
    #         print(f"{row['cveid']} already exists, skipping")
    #         continue
    #
    #     if (len(str(row['describe'])) < 200) | (len(str(row['describe'])) > 400):
    #         print('Length out of range: {0}'.format(len(str(row['describe']))))
    #         print(row['describe'])
    #         continue
    #     try:
    #         workbook = load_workbook(filename)
    #         worksheet = workbook.active
    #     except FileNotFoundError:
    #         workbook = Workbook()
    #         worksheet = workbook.active
    #         header = ['cveid', 'describe_zh', 'describe', 'score', 'vulntype', 'producttype', 'vendor', 'product', 'version']
    #         worksheet.append(header)
    #     try:
    #         # row['describe_zh'] = en2zh(row['describe'])
    #         text = re.sub(pattern, r'\1_\2', row['describe'])
    #         result = mytranslator.fanyi(text)
    #         if "},{" in result:  # a '.' in the source split the result into several segments
    #             row['describe_zh'] = '英文句号翻译错误!!'
    #             print('英文句号翻译错误!!')
    #         else:
    #             row['describe_zh'] = result
    #             print('Translation:', row['describe_zh'])
    #     except Exception as e:
    #         row['describe_zh'] = '接口调用报错!!'
    #         print('接口调用报错!!', str(e))
    #     finally:
    #         values = [row['cveid'], row['describe_zh'], row['describe'], row['score'], row['vulntype'], row['producttype'], row['vendor'],
    #                   row['product'], row['version']]
    #         worksheet.append(values)
    #
    #         workbook.save(filename)

    # ############################################## Google Translate API
    # from pygtrans import Translate
    #
    # client = Translate()
    # text = client.translate('Google Translate')
    # print(text.translatedText)  # Google translation

    # ############################################## Timing example
    # start = time.perf_counter()
    # time.sleep(1)
    # # end time
    # end = time.perf_counter()
    # # elapsed time
    # runTime = end - start
    # runTime_ms = runTime * 1000
    # # print the elapsed time
    # print("Run time:", runTime, "s")
    # print("Run time:", runTime_ms, "ms")
    #
    # exit()

    from pygtrans import Translate

    client = Translate()
    end = 0
    # Translate the crawled data into Chinese through the API
    exist_data = pd.read_excel("all_spyder(中文).xlsx")
    exist_cves = list(exist_data['cve'])
    # data_output = pd.read_excel("all_spyder.xlsx")
    data_output = pd.read_excel("output(2015).xlsx")
    data_output = data_output.rename(
        columns={'cveid': 'cve', 'describe_zh': 'remark', 'vulntype': 'system', 'vendor': 'company', 'product': 'brand'})
    # De-duplicate
    print(data_output.columns)
    print(data_output.head())
    print_index = 0
    result = []
    # Iterate over each row of the dataframe
    for index, row in data_output.iterrows():
        if index < 0:
            continue  # 20472
        # print(f"row: {row}")
        print(f"index: {index} {row['cve']}")  # print the current row index

        if row['cve'] in exist_cves:
            print(f"{row['cve']} already exists, skipping")
            continue

        if (len(str(row['describe'])) < 100) or (len(str(row['describe'])) > 200):
            print('Description length out of range, skipping')
            # print('Length out of range: {0}'.format(len(str(row['describe']))))
            # print(row['describe'])
            continue
        try:
            start = time.perf_counter()
            # Crude throttle: busy-wait about 50 ms before each translate call
            # (skipped before the very first call, while end is still 0)
            while (end - start) < 0.05 and end != 0:
                # print('waiting ...')
                end = time.perf_counter()
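            # Translate the English CVE description with pygtrans; no explicit
            # target language is passed, so the library's default target is used
            # (the script expects Chinese output here).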
            text = client.translate(row['describe'])
            end = time.perf_counter()
            # print(text.translatedText)  # Google translation
            row['remark'] = text.translatedText
            print('Translation:', row['remark'])
            print_index += 1
        except Exception as e:
            row['remark'] = '接口调用报错!!'  # sentinel written to the sheet: API call failed
            print('接口调用报错!!', str(e))
        finally:
            values = [row['cve'], row['remark'], row['describe'], row['score'], row['system'], row['producttype'],
                      row['company'], row['brand'], row['version']]
            result.append(values)

            if print_index % 20 == 0:
                filename = "all_spyder(中文new)_{0}.xlsx".format(print_index // 1000)
                print('Writing...', filename)
                try:
                    workbook = load_workbook(filename)
                    worksheet = workbook.active
                except FileNotFoundError:
                    workbook = Workbook()
                    worksheet = workbook.active
                    header = ['cve', 'remark', 'describe', 'score', 'system', 'producttype', 'company', 'brand', 'version']
                    worksheet.append(header)
                finally:
                    for r in result:
                        worksheet.append(r)
                    workbook.save(filename)
                    result = []
                    workbook.close()
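
# A minimal usage sketch of the helpers defined above (the inputs are purely
# illustrative; pyenchant needs an en_US dictionary installed for the last call):
#
#     translator = YouDaoTranslator()
#     print(translator.translator('hello world'))   # unsigned /translate endpoint
#     print(translator.fanyi('hello world'))        # signed /translate_o endpoint
#     print(is_spelled_correctly('controlmicrosystems'))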