HMM CDN检测

简介:
复制代码
# -*- coding:utf-8 -*-
 
import sys
import re
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os


def iterbrowse(path):
    """Yield the full path of every file found beneath ``path``.

    The tree is traversed with ``os.walk``; each file name is joined to the
    directory that contains it. Directories themselves are never yielded.
    """
    for root, _subdirs, names in os.walk(path):
        for name in names:
            yield os.path.join(root, name)


# Suffixes that are recognised directly, without consulting tldextract.
# Built once at import time instead of on every call.
_KNOWN_SUFFIXES = frozenset((
    '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
    '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
    '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
    '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
    '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
    '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
    '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market',
    '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es',
    '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm',
    '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
    '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl',
    '.nu', '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw',
    '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk',
    '.vg',
))


def extract_domain(domain):
    """Split a DNS name into ``(main domain, sub-domain)``.

    The name is lower-cased first. When it has at least three labels and its
    last two labels (checked first) or its last label form a known suffix,
    the main domain is that suffix plus the label before it and the
    sub-domain is everything to the left. Every other name, including names
    with fewer than three labels, is handed to ``tldextract``.

    Args:
        domain: the queried name, e.g. ``"a.b.example.com"``.

    Returns:
        A ``(mdomain, subdomain)`` tuple of strings; ``subdomain`` is ``""``
        when nothing precedes the main domain.
    """
    domain = domain.lower()
    names = domain.split(".")
    if len(names) >= 3:
        # Two-label suffixes such as ".co.uk" must win over ".uk".
        if "." + ".".join(names[-2:]) in _KNOWN_SUFFIXES:
            return ".".join(names[-3:]), ".".join(names[:-3])
        if "." + names[-1] in _KNOWN_SUFFIXES:
            return ".".join(names[-2:]), ".".join(names[:-2])
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("New domain suffix found. Use tld extract domain...")

    pos = domain.rfind("/")
    if pos >= 0:  # maybe subdomain contains /, for dns tunnel tool
        # Only the part after the last "/" is parsed; the prefix is kept
        # verbatim as the leading part of the sub-domain.
        ext = tldextract.extract(domain[pos + 1:])
        subdomain = domain[:pos + 1] + ext.subdomain
    else:
        ext = tldextract.extract(domain)
        subdomain = ext.subdomain
    if ext.suffix:
        mdomain = ext.domain + "." + ext.suffix
    else:
        mdomain = ext.domain
    return mdomain, subdomain
 

def parse(log):
    """Extract ``(main domain, sub-domain)`` from one '^'-separated log line.

    Field 7 (1-based) must be ``'17'`` and field 6 must be ``'53'`` for the
    line to be treated as a DNS record (presumably IP protocol 17 = UDP and
    destination port 53; confirm against the log format). The queried name
    is read from field 55 and split with ``extract_domain``.

    Args:
        log: one raw log line.

    Returns:
        ``(mdomain, subdomain)``, or ``("", "")`` when the line is too short
        or is not a DNS record. Rejected lines are printed.
    """
    DST_PORT_IDX = 6 - 1
    PROTOCOL_IDX = 7 - 1
    DNS_QUERY_NAME_IDX = 55 - 1  # domain

    data = log.split('^')
    # Guard before indexing: a line with fewer than 7 fields used to raise
    # IndexError instead of being reported as a bad line.
    if len(data) <= PROTOCOL_IDX:
        print("error line:")
        print(log)
        return ("", "")

    if data[PROTOCOL_IDX] != '17' or data[DST_PORT_IDX] != '53':
        print("error line not a DNS:")
        print(log)
        return ("", "")

    if len(data) <= DNS_QUERY_NAME_IDX:
        print("error line:")
        print(log)
        return ("", "")

    mdomain, subdomain = extract_domain(data[DNS_QUERY_NAME_IDX])
    return (mdomain, subdomain)


 
# Minimum length a sub-domain must have to be processed
MIN_LEN = 3

# Number of hidden states
N = 5
# Maximum-likelihood probability threshold
# (not referenced by the code shown in this script)
T = -50

# Model file name
FILE_MODEL = "hmm-cdn.m"
 

def get_cdn_domains(dir_path, max_domains=2000):
    """Collect sub-domains from every log file beneath ``dir_path``.

    Each line of each file is run through ``parse``; sub-domains shorter
    than ``MIN_LEN`` (which includes the empty result of rejected lines) are
    skipped.

    Args:
        dir_path: directory tree containing the log files.
        max_domains: stop as soon as this many sub-domains were collected.
            The cap is checked after each append, so at least one entry is
            returned when any line qualifies. Defaults to the previously
            hard-coded 2000.

    Returns:
        List of sub-domain strings in the order they were read.
    """
    domain_list = []
    for path in iterbrowse(dir_path):
        with open(path) as f:
            for line in f:
                _mdomain, sub_domain = parse(line)
                if len(sub_domain) < MIN_LEN:
                    continue
                domain_list.append(sub_domain)
                if len(domain_list) >= max_domains:
                    return domain_list
    return domain_list
    

def domain2ver(domain):
    """Encode ``domain`` as a list of one-element lists of character codes.

    Each character becomes ``[ord(char)]``, so the result has one row per
    character and one column.
    """
    return [[ord(char)] for char in domain]
 

def train_hmm(domain_list):
    """Fit a Gaussian HMM on the given domains and save it to ``FILE_MODEL``.

    Every domain is encoded with ``domain2ver``; the encoded sequences are
    stacked into one observation matrix and their lengths are passed
    alongside so the model can tell the sequences apart.

    Args:
        domain_list: iterable of domain strings used as training sequences.

    Returns:
        The fitted ``hmm.GaussianHMM`` (``N`` states, "full" covariance,
        ``n_iter=100``).

    Raises:
        ValueError: if no non-empty domain is supplied.
    """
    # No dummy [[0]] seed sequence: it used to be trained on as if it were a
    # real one-character domain.
    X = []
    X_lens = []
    for domain in domain_list:
        ver = domain2ver(domain)
        if not ver:
            # A zero-length sequence carries no observations; skip it.
            continue
        X.extend(ver)  # linear, unlike re-concatenating the whole array
        X_lens.append(len(ver))

    if not X:
        raise ValueError("train_hmm needs at least one non-empty domain")

    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=100)
    remodel.fit(np.array(X), X_lens)
    joblib.dump(remodel, FILE_MODEL)

    return remodel
 

def test(remodel, domain_list):
    """Score every domain with ``remodel`` and return plot-ready data.

    Args:
        remodel: fitted model exposing ``score(observations)``.
        domain_list: iterable of domain strings.

    Returns:
        ``(x, y)`` where ``x`` holds each domain's length and ``y`` the
        score ``remodel.score`` returned for it, in input order.
    """
    x = []
    y = []
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        pro = remodel.score(np_ver)
        # %d shortens the score for display only; the full value goes in y.
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("SCORE:(%d) DOMAIN:(%s) " % (pro, domain))
        x.append(len(domain))
        y.append(pro)
    return x, y
 
 
if __name__ == '__main__':
    # Train on the labelled CDN sample, then reload the persisted model so
    # the scores below come from exactly what was written to FILE_MODEL.
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
    remodel = train_hmm(domain_list)
    remodel = joblib.load(FILE_MODEL)

    # Score the three labelled samples. Single-argument print(...) emits the
    # same text on Python 2 and 3.
    x_1, y_1 = test(remodel, domain_list)
    print(x_1)
    print(y_1)
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
    x_2, y_2 = test(remodel, domain_list)
    print(x_2)
    print(y_2)
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
    x_3, y_3 = test(remodel, domain_list)
    print(x_3)
    print(y_3)

    # Scatter plot: sub-domain length against HMM score, one colour per sample.
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    ax.scatter(x_3, y_3, color='b', label="WHITE")
    ax.scatter(x_2, y_2, color='g', label="BLACK")
    ax.scatter(x_1, y_1, color='r', label="CDN")
    ax.legend(loc='right')
    plt.show()
复制代码

 

使用pickle保存和加载模型:

复制代码
# -*- coding:utf-8 -*-
 
import sys
import re
from hmmlearn import hmm
import numpy as np
#from sklearn.externals import joblib
import matplotlib.pyplot as plt
import tldextract
import os
import pickle

def iterbrowse(path):
    """Generate the path of each file located anywhere under ``path``.

    Uses ``os.walk`` and yields ``directory/file name`` for files only.
    """
    for entry in os.walk(path):
        folder, file_names = entry[0], entry[2]
        for file_name in file_names:
            yield os.path.join(folder, file_name)


# Suffixes that are recognised directly, without consulting tldextract.
# Built once at import time instead of on every call.
_KNOWN_SUFFIXES = frozenset((
    '.com', '.la', '.io', '.co', '.cn', '.info', '.net', '.org', '.me',
    '.mobi', '.us', '.biz', '.xxx', '.ca', '.co.jp', '.com.cn', '.net.cn',
    '.org.cn', '.mx', '.tv', '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag',
    '.am', '.asia', '.at', '.be', '.com.br', '.net.br', '.name', '.live',
    '.news', '.bz', '.tech', '.pub', '.wang', '.space', '.top', '.xin',
    '.social', '.date', '.site', '.red', '.studio', '.link', '.online',
    '.help', '.kr', '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market',
    '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es', '.com.es',
    '.nom.es', '.org.es', '.eu', '.wiki', '.design', '.software', '.fm',
    '.fr', '.gs', '.in', '.co.in', '.firm.in', '.gen.in', '.ind.in',
    '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl',
    '.nu', '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw',
    '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk',
    '.vg',
))


def extract_domain(domain):
    """Split a DNS name into ``(main domain, sub-domain)``.

    The name is lower-cased first. When it has at least three labels and its
    last two labels (checked first) or its last label form a known suffix,
    the main domain is that suffix plus the label before it and the
    sub-domain is everything to the left. Every other name, including names
    with fewer than three labels, is handed to ``tldextract``.

    Args:
        domain: the queried name, e.g. ``"a.b.example.com"``.

    Returns:
        A ``(mdomain, subdomain)`` tuple of strings; ``subdomain`` is ``""``
        when nothing precedes the main domain.
    """
    domain = domain.lower()
    names = domain.split(".")
    if len(names) >= 3:
        # Two-label suffixes such as ".co.uk" must win over ".uk".
        if "." + ".".join(names[-2:]) in _KNOWN_SUFFIXES:
            return ".".join(names[-3:]), ".".join(names[:-3])
        if "." + names[-1] in _KNOWN_SUFFIXES:
            return ".".join(names[-2:]), ".".join(names[:-2])
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("New domain suffix found. Use tld extract domain...")

    pos = domain.rfind("/")
    if pos >= 0:  # maybe subdomain contains /, for dns tunnel tool
        # Only the part after the last "/" is parsed; the prefix is kept
        # verbatim as the leading part of the sub-domain.
        ext = tldextract.extract(domain[pos + 1:])
        subdomain = domain[:pos + 1] + ext.subdomain
    else:
        ext = tldextract.extract(domain)
        subdomain = ext.subdomain
    if ext.suffix:
        mdomain = ext.domain + "." + ext.suffix
    else:
        mdomain = ext.domain
    return mdomain, subdomain
 

def parse(log):
    """Extract ``(main domain, sub-domain)`` from one '^'-separated log line.

    Field 7 (1-based) must be ``'17'`` and field 6 must be ``'53'`` for the
    line to be treated as a DNS record (presumably IP protocol 17 = UDP and
    destination port 53; confirm against the log format). The queried name
    is read from field 55 and split with ``extract_domain``.

    Args:
        log: one raw log line.

    Returns:
        ``(mdomain, subdomain)``, or ``("", "")`` when the line is too short
        or is not a DNS record. Rejected lines are printed.
    """
    DST_PORT_IDX = 6 - 1
    PROTOCOL_IDX = 7 - 1
    DNS_QUERY_NAME_IDX = 55 - 1  # domain

    data = log.split('^')
    # Guard before indexing: a line with fewer than 7 fields used to raise
    # IndexError instead of being reported as a bad line.
    if len(data) <= PROTOCOL_IDX:
        print("error line:")
        print(log)
        return ("", "")

    if data[PROTOCOL_IDX] != '17' or data[DST_PORT_IDX] != '53':
        print("error line not a DNS:")
        print(log)
        return ("", "")

    if len(data) <= DNS_QUERY_NAME_IDX:
        print("error line:")
        print(log)
        return ("", "")

    mdomain, subdomain = extract_domain(data[DNS_QUERY_NAME_IDX])
    return (mdomain, subdomain)


 
# Minimum length a sub-domain must have to be processed
MIN_LEN = 1

# Number of hidden states
N = 8
# Maximum-likelihood probability threshold
# (not referenced by the code shown in this script)
T = -50

# Model file names
FILE_MODEL = "hmm-cdn.m"
FILE_MODEL2 = "hmm-cdn-white.pkl"
 

def get_cdn_domains(dir_path, max_domains=3000):
    """Collect sub-domains from every log file beneath ``dir_path``.

    Each line of each file is run through ``parse``; sub-domains shorter
    than ``MIN_LEN`` (which includes the empty result of rejected lines) are
    skipped.

    Args:
        dir_path: directory tree containing the log files.
        max_domains: stop as soon as this many sub-domains were collected.
            The cap is checked after each append, so at least one entry is
            returned when any line qualifies. Defaults to the previously
            hard-coded 3000.

    Returns:
        List of sub-domain strings in the order they were read.
    """
    domain_list = []
    for path in iterbrowse(dir_path):
        with open(path) as f:
            for line in f:
                _mdomain, sub_domain = parse(line)
                if len(sub_domain) < MIN_LEN:
                    continue
                domain_list.append(sub_domain)
                if len(domain_list) >= max_domains:
                    return domain_list
    return domain_list
    

def domain2ver(domain):
    """Turn ``domain`` into a column of character codes.

    Returns a list with one ``[code]`` entry per character, in order.
    """
    return [[code] for code in map(ord, domain)]
 

def train_hmm(domain_list):
    """Return a Gaussian HMM for the given domains, cached in ``FILE_MODEL2``.

    If ``FILE_MODEL2`` already exists it is unpickled and returned as is and
    ``domain_list`` is ignored. Otherwise every domain is encoded with
    ``domain2ver``, a model is fitted on the stacked sequences and pickled
    to ``FILE_MODEL2``.

    Args:
        domain_list: iterable of domain strings used as training sequences.

    Returns:
        The loaded or freshly fitted model (``N`` states, "full" covariance,
        ``n_iter=500`` when fitted here).

    Raises:
        ValueError: if training is needed but no non-empty domain is given.
    """
    if os.path.exists(FILE_MODEL2):
        print("found model file, use it...")
        # NOTE: unpickling runs code embedded in the file; only use model
        # files this script wrote itself or that come from a trusted source.
        with open(FILE_MODEL2, 'rb') as file_model:
            return pickle.load(file_model)

    # No dummy [[0]] seed sequence: it used to be trained on as if it were a
    # real one-character domain.
    X = []
    X_lens = []
    for domain in domain_list:
        ver = domain2ver(domain)
        if not ver:
            # A zero-length sequence carries no observations; skip it.
            continue
        X.extend(ver)  # linear, unlike re-concatenating the whole array
        X_lens.append(len(ver))

    if not X:
        raise ValueError("train_hmm needs at least one non-empty domain")

    # Other covariance types that can be tried: spherical, diag, tied.
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
    remodel.fit(np.array(X), X_lens)

    # "with" closes the file even if pickling fails.
    with open(FILE_MODEL2, 'wb') as file_model:
        pickle.dump(remodel, file_model)

    return remodel
 

def test(remodel, domain_list):
    """Score every domain with ``remodel`` and return plot-ready data.

    Args:
        remodel: fitted model exposing ``score(observations)``.
        domain_list: iterable of domain strings.

    Returns:
        ``(x, y)`` where ``x`` holds each domain's length and ``y`` the
        score ``remodel.score`` returned for it, in input order.
    """
    x = []
    y = []
    for domain in domain_list:
        np_ver = np.array(domain2ver(domain))
        pro = remodel.score(np_ver)
        # %d shortens the score for display only; the full value goes in y.
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("SCORE:(%d) DOMAIN:(%s) " % (pro, domain))
        x.append(len(domain))
        y.append(pro)
    return x, y
 
 
if __name__ == '__main__':
    # The model is trained on CDN and white-like sub-domains together
    # (or loaded from FILE_MODEL2 when that file already exists).
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_cdn")
    domain_list2 = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_white_like")
    remodel = train_hmm(domain_list + domain_list2)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    x_1, y_1 = test(remodel, domain_list)
    print(x_1)
    print(y_1)
    domain_list = get_cdn_domains("/home/bonelee/latest_metadata_sample/labeled_black")
    x_2, y_2 = test(remodel, domain_list)
    print(x_2)
    print(y_2)
    # The white-like sample was already read above; reuse it instead of
    # walking the directory a second time.
    x_3, y_3 = test(remodel, domain_list2)
    print(x_3)
    print(y_3)

    # Scatter plot: sub-domain length against HMM score. The white-like
    # points (x_3, y_3) are printed above but intentionally not plotted.
    fig, ax = plt.subplots()
    ax.set_xlabel('Domain Length')
    ax.set_ylabel('HMM Score')
    ax.scatter(x_2, y_2, color='g', label="DNS tunnel")
    ax.scatter(x_1, y_1, color='r', label="CDN")
    ax.legend(loc='right')
    plt.show()
复制代码

 其中:X = [[0]]、X_lens = [1] 的初始化也可以按照下面的方式改写,去除了冗余的初始元素。

复制代码
def train_hmm(domain_list):
    """Return a Gaussian HMM for the given domains, cached in ``FILE_MODEL2``.

    If ``FILE_MODEL2`` already exists it is unpickled and returned as is and
    ``domain_list`` is ignored. Otherwise every domain is encoded with
    ``domain2ver``, a model is fitted on the stacked sequences and pickled
    to ``FILE_MODEL2``.

    Args:
        domain_list: iterable of domain strings used as training sequences.

    Returns:
        The loaded or freshly fitted model (``N`` states, "full" covariance,
        ``n_iter=500`` when fitted here).

    Raises:
        ValueError: if training is needed but no non-empty domain is given.
    """
    if os.path.exists(FILE_MODEL2):
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("found model file, use it...")
        # NOTE: unpickling runs code embedded in the file; only use model
        # files this script wrote itself or that come from a trusted source.
        with open(FILE_MODEL2, 'rb') as file_model:
            return pickle.load(file_model)

    X = []
    X_lens = []
    for domain in domain_list:
        ver = domain2ver(domain)
        if not ver:
            # A zero-length sequence carries no observations; skip it.
            continue
        # extend() appends in place; "X = X + ver" copied the whole list on
        # every iteration. List concatenation cannot raise ValueError, so
        # the former try/except around it was dead code.
        X.extend(ver)
        X_lens.append(len(ver))

    if not X:
        raise ValueError("train_hmm needs at least one non-empty domain")

    # Other covariance types that can be tried: spherical, diag, tied.
    remodel = hmm.GaussianHMM(n_components=N, covariance_type="full", n_iter=500)
    remodel.fit(X, X_lens)

    # "with" closes the file even if pickling fails.
    with open(FILE_MODEL2, 'wb') as file_model:
        pickle.dump(remodel, file_model)

    return remodel
复制代码

 















本文转自张昺华-sky博客园博客,原文链接:http://www.cnblogs.com/bonelee/p/7986678.html,如需转载请自行联系原作者



相关文章
|
机器学习/深度学习 人工智能 算法
NeurIPS 2021 | CDN:首个融合two-stage和one-stage思想的HOI检测方法
NeurIPS 2021 | CDN:首个融合two-stage和one-stage思想的HOI检测方法
NeurIPS 2021 | CDN:首个融合two-stage和one-stage思想的HOI检测方法
|
2月前
|
域名解析 网络协议 安全
阿里云CDN
本文介绍阿里云CDN产品中涉及的基本概念,便于您更准确地理解和使用CDN产品。
61 5
|
8月前
|
弹性计算 缓存 运维
【运维知识进阶篇】用阿里云部署kod可道云网盘(DNS解析+CDN缓存+Web应用防火墙+弹性伸缩)(三)
【运维知识进阶篇】用阿里云部署kod可道云网盘(DNS解析+CDN缓存+Web应用防火墙+弹性伸缩)(三)
136 0
|
1月前
|
安全 网络安全 CDN
阿里云CDN HTTPS 证书配置流程
阿里云CDN HTTPS 证书配置流程
174 1
|
8月前
|
弹性计算 缓存 运维
【运维知识进阶篇】用阿里云部署kod可道云网盘(DNS解析+CDN缓存+Web应用防火墙+弹性伸缩)(二)
【运维知识进阶篇】用阿里云部署kod可道云网盘(DNS解析+CDN缓存+Web应用防火墙+弹性伸缩)(二)
132 0
|
8月前
|
缓存 弹性计算 运维
【运维知识进阶篇】用阿里云部署kod可道云网盘(DNS解析+CDN缓存+Web应用防火墙+弹性伸缩)(一)
【运维知识进阶篇】用阿里云部署kod可道云网盘(DNS解析+CDN缓存+Web应用防火墙+弹性伸缩)
159 0
|
9月前
|
缓存 前端开发 安全
阿里云CDN简介和优惠购买流程
阿里云CDN,首先需要了解什么是CDN。CDN全称是Content Delivery Network,即内容分发网络。CDN可以通过地理位置分发策略,使用户就近访问到离他们最近的服务器,从而提高访问速度,同时也能减轻源站流量压力,确保整个网站的稳定性。 阿里云CDN是阿里云推出的全球服务,涵盖全球范围内1250+节点。阿里云CDN采用的是BGP多线路中转技术,保证全局加速,让网络传输更加稳定。阿里云CDN可以帮助用户提高网站的访问速度,从而提升用户体验。同时,阿里云CDN还有许多其他的优势。
|
10月前
|
存储 缓存 边缘计算
阿里云CDN简介和使用流程
阿里云内容分发网络CDN(Content Delivery Network)是建立并覆盖在承载网之上,由遍布全球的边缘节点服务器群组成的分布式网络。阿里云CDN能分担源站压力,避免网络拥塞,确保在不同区域、不同场景下加速网站内容的分发,提高资源访问速度。(1)全网带宽输出能力达150 Tbps。(2)把静态内容缓存到边缘节点提高访问下载效率。
2075 0
|
11月前
|
边缘计算 CDN
《阿里云产品手册2022-2023 版》——CDN与边缘计算
《阿里云产品手册2022-2023 版》——CDN与边缘计算
171 0