hmm CDN检测

简介:
复制代码
# -*- coding:utf-8 -*-
 
import sys
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os


def iterbrowse(path):
    """Yield the full path of every file found under *path*, recursively.

    Directories are walked with ``os.walk``; only file entries are yielded,
    each joined with the directory it was found in.
    """
    for dirpath, _subdirs, filenames in os.walk(path):
        for name in filenames:
            yield os.path.join(dirpath, name)


# Suffixes recognised without consulting tldextract.  Built once at import
# time instead of on every call, since extract_domain runs once per log line.
_KNOWN_SUFFIXES = frozenset((
    '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
    '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
    '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
    '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
    '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
    '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
    '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market',
    '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es',
    '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm',
    '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
    '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl',
    '.nu', '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw',
    '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk',
    '.vg',
))


def extract_domain(domain):
    """Split a DNS name into ``(main_domain, subdomain)``.

    The name is lower-cased first.  Names with at least three labels are
    matched against the built-in suffix table, trying the two-label suffix
    (e.g. ``.co.uk``) before the one-label suffix (e.g. ``.com``).  Anything
    else is handed to ``tldextract``.

    Args:
        domain: the queried name, e.g. ``"a.b.example.com"``.

    Returns:
        A 2-tuple of strings: the main domain (``"example.com"``) and the
        remaining leading labels joined with dots (``"a.b"``, or ``""`` when
        there are none).
    """
    domain = domain.lower()
    names = domain.split(".")
    if len(names) >= 3:
        # Two-label suffixes must be tried first, otherwise "x.example.co.uk"
        # would be split as main domain "co.uk".
        if "." + ".".join(names[-2:]) in _KNOWN_SUFFIXES:
            return ".".join(names[-3:]), ".".join(names[:-3])
        if "." + names[-1] in _KNOWN_SUFFIXES:
            return ".".join(names[-2:]), ".".join(names[:-2])
    print("New domain suffix found. Use tld extract domain...")

    pos = domain.rfind("/")
    if pos >= 0:  # maybe subdomain contains /, for dns tunnel tool
        # Only the part after the last "/" is parsed; everything up to and
        # including that "/" is kept as a prefix of the subdomain.
        ext = tldextract.extract(domain[pos + 1:])
        subdomain = domain[:pos + 1] + ext.subdomain
    else:
        ext = tldextract.extract(domain)
        subdomain = ext.subdomain
    if ext.suffix:
        mdomain = ext.domain + "." + ext.suffix
    else:
        mdomain = ext.domain
    return mdomain, subdomain
 

def parse(log):
    """Extract ``(main_domain, subdomain)`` from one ``^``-separated log line.

    A line is treated as a DNS query when field 7 (protocol) is ``'17'`` and
    field 6 (destination port) is ``'53'``; the queried name is then read
    from field 55 and split with ``extract_domain``.

    Args:
        log: one raw log line.

    Returns:
        ``(main_domain, subdomain)``, or ``("", "")`` when the line is too
        short or is not a DNS query (the offending line is printed).
    """
    DST_PORT_IDX = 6 - 1
    PROTOCOL_IDX = 7 - 1
    DNS_QUERY_NAME_IDX = 55 - 1  # domain

    data = log.split('^')
    # Guard before indexing: blank or truncated lines would otherwise raise
    # IndexError on the protocol/port lookup.
    if len(data) <= PROTOCOL_IDX:
        print("error line:")
        print(log)
        return ("", "")

    if data[PROTOCOL_IDX] != '17' or data[DST_PORT_IDX] != '53':
        print("error line not a DNS:")
        print(log)
        return ("", "")

    if len(data) <= DNS_QUERY_NAME_IDX:
        print("error line:")
        print(log)
        return ("", "")

    # If the query name is the last field it still carries the line
    # terminator, which would defeat the suffix lookup.
    domain = data[DNS_QUERY_NAME_IDX].rstrip("\r\n")
    mdomain, subdomain = extract_domain(domain)
    return (mdomain, subdomain)


 
# Minimum length a (sub)domain must have to be processed
MIN_LEN=3
 
# Number of HMM states
N=5
# Maximum-likelihood probability threshold (not referenced by the code shown here)
T=-50
 
# Model file name
FILE_MODEL="hmm-cdn.m"
 

def get_cdn_domains(dir_path, max_count=2000):
    """Collect subdomains from every log file under *dir_path*.

    Each line of each file is run through ``parse``; the subdomain part is
    kept when it is at least ``MIN_LEN`` characters long.

    Args:
        dir_path: directory that is walked recursively for log files.
        max_count: stop reading as soon as this many subdomains have been
            collected (default 2000, the previously hard-coded cap).

    Returns:
        A list of subdomain strings, at most ``max_count`` long.
    """
    domain_list = []
    for path in iterbrowse(dir_path):
        with open(path) as f:
            for line in f:
                mdomain, sub_domain = parse(line)
                if len(sub_domain) >= MIN_LEN:
                    domain_list.append(sub_domain)
                    if len(domain_list) >= max_count:
                        return domain_list
    return domain_list
    

def domain2ver(domain):
    """Turn a string into a list of one-element lists of character codes.

    Example: ``"ab"`` becomes ``[[97], [98]]``.
    """
    return [[ord(ch)] for ch in domain]
 

def train_hmm(domain_list):
    """Fit a Gaussian HMM on the character codes of *domain_list* and save it.

    Every domain becomes one observation sequence (one character code per
    step).  All sequences are stacked into a single array and their lengths
    are passed to ``fit`` so the model knows where each sequence ends.
    The fitted model is dumped to ``FILE_MODEL`` with joblib.

    Args:
        domain_list: iterable of domain strings; empty strings are skipped.

    Returns:
        The fitted ``hmm.GaussianHMM`` instance.

    Raises:
        ValueError: if there is no non-empty domain to train on.
    """
    sequences = []
    X_lens = []
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        if len(np_ver) == 0:
            # An empty array has shape (0,) and cannot be stacked with the
            # (n, 1) arrays produced for real domains.
            continue
        sequences.append(np_ver)
        X_lens.append(len(np_ver))

    if not sequences:
        raise ValueError("train_hmm: no non-empty domains to train on")

    # Concatenate once at the end: no placeholder row is needed to give the
    # array its shape, and the cost stays linear in the total length.
    X = np.concatenate(sequences)

    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(X, X_lens)
    joblib.dump(remodel, FILE_MODEL)

    return remodel
 

def test(remodel, domain_list):
    """Score every domain with *remodel* and print the result.

    Args:
        remodel: a model exposing ``score(array)``; here the fitted HMM.
        domain_list: iterable of domain strings.

    Returns:
        ``(x, y)`` where ``x`` holds the length of each domain and ``y`` the
        corresponding score, in input order.
    """
    x = []
    y = []
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        pro = remodel.score(np_ver)
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("SCORE:(%d) DOMAIN:(%s) " % (pro, domain))
        x.append(len(domain))
        y.append(pro)
    return x, y
 
 
if __name__ == '__main__':
    # Train on the CDN-labelled sample, then reload the model from the file
    # that train_hmm just wrote, so the saved copy is what gets evaluated.
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
    remodel = train_hmm(domain_list)
    remodel = joblib.load(FILE_MODEL)

    # Score the three labelled samples: CDN (training data), black, white-like.
    x_1, y_1 = test(remodel, domain_list)
    print(x_1)
    print(y_1)
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
    x_2, y_2 = test(remodel, domain_list)
    print(x_2)
    print(y_2)
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
    x_3, y_3 = test(remodel, domain_list)
    print(x_3)
    print(y_3)

    # Scatter plot of score against domain length, one colour per sample.
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    ax.scatter(x_3, y_3, color='b', label="WHITE")
    ax.scatter(x_2, y_2, color='g', label="BLACK")
    ax.scatter(x_1, y_1, color='r', label="CDN")
    ax.legend(loc='right')
    plt.show()
复制代码

 

使用pickle保存和加载模型:

复制代码
# -*- coding:utf-8 -*-
 
import sys
import re
from hmmlearn import hmm
import numpy as np
#from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os
import pickle

def iterbrowse(path):
    """Yield the full path of every file found under *path*, recursively.

    Directories are walked with ``os.walk``; only file entries are yielded,
    each joined with the directory it was found in.
    """
    for dirpath, _subdirs, filenames in os.walk(path):
        for name in filenames:
            yield os.path.join(dirpath, name)


# Suffixes recognised without consulting tldextract.  Built once at import
# time instead of on every call, since extract_domain runs once per log line.
_KNOWN_SUFFIXES = frozenset((
    '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
    '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
    '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
    '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
    '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
    '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
    '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market',
    '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es',
    '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm',
    '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
    '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl',
    '.nu', '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw',
    '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk',
    '.vg',
))


def extract_domain(domain):
    """Split a DNS name into ``(main_domain, subdomain)``.

    The name is lower-cased first.  Names with at least three labels are
    matched against the built-in suffix table, trying the two-label suffix
    (e.g. ``.co.uk``) before the one-label suffix (e.g. ``.com``).  Anything
    else is handed to ``tldextract``.

    Args:
        domain: the queried name, e.g. ``"a.b.example.com"``.

    Returns:
        A 2-tuple of strings: the main domain (``"example.com"``) and the
        remaining leading labels joined with dots (``"a.b"``, or ``""`` when
        there are none).
    """
    domain = domain.lower()
    names = domain.split(".")
    if len(names) >= 3:
        # Two-label suffixes must be tried first, otherwise "x.example.co.uk"
        # would be split as main domain "co.uk".
        if "." + ".".join(names[-2:]) in _KNOWN_SUFFIXES:
            return ".".join(names[-3:]), ".".join(names[:-3])
        if "." + names[-1] in _KNOWN_SUFFIXES:
            return ".".join(names[-2:]), ".".join(names[:-2])
    print("New domain suffix found. Use tld extract domain...")

    pos = domain.rfind("/")
    if pos >= 0:  # maybe subdomain contains /, for dns tunnel tool
        # Only the part after the last "/" is parsed; everything up to and
        # including that "/" is kept as a prefix of the subdomain.
        ext = tldextract.extract(domain[pos + 1:])
        subdomain = domain[:pos + 1] + ext.subdomain
    else:
        ext = tldextract.extract(domain)
        subdomain = ext.subdomain
    if ext.suffix:
        mdomain = ext.domain + "." + ext.suffix
    else:
        mdomain = ext.domain
    return mdomain, subdomain
 

def parse(log):
    """Extract ``(main_domain, subdomain)`` from one ``^``-separated log line.

    A line is treated as a DNS query when field 7 (protocol) is ``'17'`` and
    field 6 (destination port) is ``'53'``; the queried name is then read
    from field 55 and split with ``extract_domain``.

    Args:
        log: one raw log line.

    Returns:
        ``(main_domain, subdomain)``, or ``("", "")`` when the line is too
        short or is not a DNS query (the offending line is printed).
    """
    DST_PORT_IDX = 6 - 1
    PROTOCOL_IDX = 7 - 1
    DNS_QUERY_NAME_IDX = 55 - 1  # domain

    data = log.split('^')
    # Guard before indexing: blank or truncated lines would otherwise raise
    # IndexError on the protocol/port lookup.
    if len(data) <= PROTOCOL_IDX:
        print("error line:")
        print(log)
        return ("", "")

    if data[PROTOCOL_IDX] != '17' or data[DST_PORT_IDX] != '53':
        print("error line not a DNS:")
        print(log)
        return ("", "")

    if len(data) <= DNS_QUERY_NAME_IDX:
        print("error line:")
        print(log)
        return ("", "")

    # If the query name is the last field it still carries the line
    # terminator, which would defeat the suffix lookup.
    domain = data[DNS_QUERY_NAME_IDX].rstrip("\r\n")
    mdomain, subdomain = extract_domain(domain)
    return (mdomain, subdomain)


 
# Minimum length a (sub)domain must have to be processed
MIN_LEN=1
 
# Number of HMM states
N=8
# Maximum-likelihood probability threshold (not referenced by the code shown here)
T=-50
 
# Model file names: FILE_MODEL is only mentioned in commented-out joblib
# calls in this script; FILE_MODEL2 is the pickle file train_hmm reads/writes.
FILE_MODEL="hmm-cdn.m"
FILE_MODEL2 ="hmm-cdn-white.pkl"
 

def get_cdn_domains(dir_path, max_count=3000):
    """Collect subdomains from every log file under *dir_path*.

    Each line of each file is run through ``parse``; the subdomain part is
    kept when it is at least ``MIN_LEN`` characters long.

    Args:
        dir_path: directory that is walked recursively for log files.
        max_count: stop reading as soon as this many subdomains have been
            collected (default 3000, the previously hard-coded cap).

    Returns:
        A list of subdomain strings, at most ``max_count`` long.
    """
    domain_list = []
    for path in iterbrowse(dir_path):
        with open(path) as f:
            for line in f:
                mdomain, sub_domain = parse(line)
                if len(sub_domain) >= MIN_LEN:
                    domain_list.append(sub_domain)
                    if len(domain_list) >= max_count:
                        return domain_list
    return domain_list
    

def domain2ver(domain):
    """Turn a string into a list of one-element lists of character codes.

    Example: ``"ab"`` becomes ``[[97], [98]]``.
    """
    return [[ord(ch)] for ch in domain]
 

def train_hmm(domain_list):
    """Return a Gaussian HMM for *domain_list*, using a pickle file as cache.

    If ``FILE_MODEL2`` already exists the model stored there is returned and
    *domain_list* is ignored.  Otherwise every domain becomes one observation
    sequence (one character code per step), the model is fitted on all
    sequences and then pickled to ``FILE_MODEL2``.

    Args:
        domain_list: iterable of domain strings; empty strings are skipped.

    Returns:
        The loaded or freshly fitted ``hmm.GaussianHMM`` instance.

    Raises:
        ValueError: if training is needed but there is no non-empty domain.
    """
    if os.path.exists(FILE_MODEL2):
        print("found model file, use it...")
        # NOTE: unpickling executes code embedded in the file; only load
        # model files that this script produced itself.
        with open(FILE_MODEL2, 'rb') as file_model:
            return pickle.load(file_model)

    sequences = []
    X_lens = []
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        if len(np_ver) == 0:
            # An empty array has shape (0,) and cannot be stacked with the
            # (n, 1) arrays produced for real domains.
            continue
        sequences.append(np_ver)
        X_lens.append(len(np_ver))

    if not sequences:
        raise ValueError("train_hmm: no non-empty domains to train on")

    # Concatenate once at the end: no placeholder row is needed to give the
    # array its shape, and the cost stays linear in the total length.
    X = np.concatenate(sequences)

    # covariance_type may also be "spherical", "diag" or "tied".
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
    remodel.fit(X, X_lens)

    with open(FILE_MODEL2, 'wb') as file_model:
        pickle.dump(remodel, file_model)

    return remodel
 

def test(remodel, domain_list):
    """Score every domain with *remodel* and print the result.

    Args:
        remodel: a model exposing ``score(array)``; here the fitted HMM.
        domain_list: iterable of domain strings.

    Returns:
        ``(x, y)`` where ``x`` holds the length of each domain and ``y`` the
        corresponding score, in input order.
    """
    x = []
    y = []
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        pro = remodel.score(np_ver)
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("SCORE:(%d) DOMAIN:(%s) " % (pro, domain))
        x.append(len(domain))
        y.append(pro)
    return x, y
 
 
if __name__ == '__main__':
    # The model is trained on the CDN sample plus the white-like sample
    # (or loaded from FILE_MODEL2 when that file already exists).
    cdn_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
    white_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
    remodel = train_hmm(cdn_domains + white_domains)

    x_1, y_1 = test(remodel, cdn_domains)
    print(x_1)
    print(y_1)
    black_domains = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
    x_2, y_2 = test(remodel, black_domains)
    print(x_2)
    print(y_2)
    # Reuse the white-like list loaded above instead of reading the same
    # directory from disk a second time.
    x_3, y_3 = test(remodel, white_domains)
    print(x_3)
    print(y_3)

    # Scatter plot of score against domain length.  The white-like scores
    # (x_3, y_3) are printed above but intentionally not plotted.
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    ax.scatter(x_2, y_2, color='g', label="DNS tunnel")
    ax.scatter(x_1, y_1, color='r', label="CDN")
    ax.legend(loc='right')
    plt.show()
复制代码

 其中:X = [[0]] 和 X_lens = [1] 这两处初始化也可以改成下面的写法,直接用空列表累积数据,去掉冗余的初始占位数据。

复制代码
def train_hmm(domain_list):
    """Return a Gaussian HMM for *domain_list*, using a pickle file as cache.

    If ``FILE_MODEL2`` already exists the model stored there is returned and
    *domain_list* is ignored.  Otherwise the character codes of all domains
    are collected into one flat list of observations, with one length entry
    per domain, the model is fitted and then pickled to ``FILE_MODEL2``.

    Args:
        domain_list: iterable of domain strings; empty strings are skipped.

    Returns:
        The loaded or freshly fitted ``hmm.GaussianHMM`` instance.

    Raises:
        ValueError: if training is needed but there is no non-empty domain.
    """
    if os.path.exists(FILE_MODEL2):
        print("found model file, use it...")
        # NOTE: unpickling executes code embedded in the file; only load
        # model files that this script produced itself.
        with open(FILE_MODEL2, 'rb') as file_model:
            return pickle.load(file_model)

    X = []
    X_lens = []
    for domain in domain_list:
        ver = domain2ver(domain)
        if not ver:
            # A zero-length sequence contributes no observations.
            continue
        # extend() appends in place; "X = X + ver" copied the whole list on
        # every iteration.
        X.extend(ver)
        X_lens.append(len(ver))

    if not X:
        raise ValueError("train_hmm: no non-empty domains to train on")

    # covariance_type may also be "spherical", "diag" or "tied".
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
    remodel.fit(X, X_lens)

    with open(FILE_MODEL2, 'wb') as file_model:
        pickle.dump(remodel, file_model)

    return remodel
复制代码

 















本文转自张昺华-sky博客园博客,原文链接:http://www.cnblogs.com/bonelee/p/7986678.html,如需转载请自行联系原作者



相关文章
|
机器学习/深度学习 人工智能 算法
NeurIPS 2021 | CDN:首个融合two-stage和one-stage思想的HOI检测方法
NeurIPS 2021 | CDN:首个融合two-stage和one-stage思想的HOI检测方法
NeurIPS 2021 | CDN:首个融合two-stage和one-stage思想的HOI检测方法
|
运维 算法 监控
日志服务在CDN流量指标中的异常检测实战(下)
阿里云开发工程师悟冥带来日志服务在CDN流量指标中的异常检测实战的分享,悟冥负责日志服务平台中时序异常和时序预测的算法研发,提升系统发现异常、处理异常的能力,致力于让算法更好的协助开发和运维人员。本次专题围绕以实时的视频流媒体数据为例,介绍常用时序异常检测算法、异常维度分析的具体应用。
2586 0
|
7月前
|
缓存 前端开发 JavaScript
适合阿里云CDN分发的文件类型有哪些?
静态文件如网页、图片、视频等适合CDN分发,可提升加载速度,减轻源站压力。动态、私有或频繁变更内容则不适合。合理选择资源包,助力高效上云。
|
7月前
|
CDN
阿里云CDN计费价格如何收费的?一文看懂
阿里云CDN计费包含基础费用与增值服务。基础费用可选按流量、带宽峰值或月结95带宽计费,默认按流量计费;增值服务如HTTPS、QUIC、WAF、实时日志等按使用量收费,不使用不计费。支持资源包抵扣,详情参考官方文档。
801 10
|
7月前
|
缓存 监控 安全
如何设置阿里云CDN的流量阈值以避免超额费用?
在信息爆炸时代,阿里云CDN助力网站加速。合理设置CDN阈值可提升性能、节省带宽、增强安全。本文详解阈值配置步骤与监控优化,助你高效利用资源。无账号者可通过翼龙云上云,享技术支持与优惠。
|
7月前
|
缓存 前端开发 JavaScript
有哪些文件适合阿里云CDN分发?
静态、高频访问且对加载速度要求高的文件(如网页、图片、视频、下载文件)适合CDN加速,可提升性能与性价比;动态内容、私有数据、频繁更新或敏感资源则不宜使用。合理选择分发策略,优化体验并降低成本。
|
7月前
|
CDN
怎么调整阿里云CDN配额?
阿里云CDN助力网站加速,配额管理关乎性能与成本。本文详解带宽、流量、请求数配额定义,指导用户查看及申请提升配额,并解析费用影响与优化策略,助您合理规划资源,降本增效。
|
7月前
|
域名解析 缓存 监控
阿里云渠道商:如何排查阿里云CDN访问问题?
排查阿里云CDN访问问题需分步诊断:先检查DNS解析与CDN基础配置,再根据访问慢、内容未更新或50x错误等现象定位原因,结合ping、tracert及阿里云实时日志、节点检测等工具深度分析。掌握此流程,快速解决异常。
|
7月前
|
边缘计算 缓存 双11
阿里云渠道商:什么时候应该使用阿里云 CDN 预热?
阿里云CDN预热可将资源提前分发至边缘节点,降低首字节时间50%以上,减轻源站压力。适用于大促活动、大文件发布、定期更新、突发流量及APP资源更新等场景,提升访问速度与稳定性。首次访问求快用预热,内容更新生效用刷新。