Influence Maximization in Social Networks: A Greedy Algorithm (Python Implementation)
Environment: Windows 7, PyCharm, Anaconda2 (Python 2)
In this implementation, every node's activation threshold defaults to 0.5.
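For reference, the linear threshold (LT) model activates an inactive node v once the accumulated influence of its already-active in-neighbors reaches v's threshold; in LaTeX notation:

$$\sum_{u \in A \cap N_{\mathrm{in}}(v)} b_{uv} \;\ge\; \theta_v$$

where A is the currently active node set, b_{uv} is the influence of edge (u, v), and \theta_v is the threshold of node v (0.5 by default in the code below).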
linear_threshold_clime.py (the LT diffusion model)
# -*- coding: utf-8 -*-
"""
Implement linear threshold models
"""
import copy
import itertools
import random
import math
import networkx as nx
__all__ = ['linear_threshold']
#-------------------------------------------------------------------------
# Some Famous Diffusion Models
#-------------------------------------------------------------------------
def linear_threshold(G, seeds, steps=0):
"""
Parameters
----------
    G : networkx graph
        A NetworkX graph (multigraphs are not supported).
seeds: list of nodes
The seed nodes of the graph
steps: int
The number of steps to diffuse
When steps <= 0, the model diffuses until no more nodes
can be activated
    Returns
    -------
layer_i_nodes : list of list of activated nodes
layer_i_nodes[0]: the seeds
layer_i_nodes[k]: the nodes activated at the kth diffusion step
Notes
-----
1. Each node is supposed to have an attribute "threshold". If not, the
default value is given (0.5).
    2. Each edge is supposed to have an attribute "influence". If not, a
       default value out_deg(u) / (sum of out-degrees of v's in-neighbors)
       is assigned to edge (u, v).
"""
    if G.is_multigraph():
        raise Exception(
            "linear_threshold() is not defined for graphs with multiedges.")
# make sure the seeds are in the graph
    for s in seeds:
        if s not in G:
            raise Exception("seed %s is not in the graph" % s)
    # work on a directed copy of the graph
    if not G.is_directed():
        DG = nx.DiGraph(G)
    else:
        DG = copy.deepcopy(G)
    # init thresholds: default to 0.5 when a node has no "threshold" attribute
    for n in DG.nodes():
        if 'threshold' not in DG.nodes[n]:
            DG.nodes[n]['threshold'] = 0.5
        elif DG.nodes[n]['threshold'] > 1:
            raise Exception("node threshold:", DG.nodes[n]['threshold'],
                            "cannot be larger than 1")
    # init influences: the default influence of edge (u, v) is
    # out_deg(u) / (sum of out-degrees of v's in-neighbors)
    out_deg_all = DG.out_degree()            # out-degree of every node
    for e in DG.edges():                     # loop over all edges
        if 'influence' not in DG[e[0]][e[1]]:
            out_deg = out_deg_all[e[0]]      # out-degree of the source node e[0]
            out_deg_sum = 0
            for i in DG.predecessors(e[1]):  # sum the out-degrees of e[1]'s in-neighbors
                out_deg_sum = out_deg_sum + out_deg_all[i]
            # float() avoids Python 2 integer division, which would truncate to 0
            DG[e[0]][e[1]]['influence'] = float(out_deg) / out_deg_sum
        elif DG[e[0]][e[1]]['influence'] > 1:
            raise Exception("edge influence:", DG[e[0]][e[1]]['influence'],
                            "cannot be larger than 1")
# perform diffusion
A = copy.deepcopy(seeds)
if steps <= 0:
# perform diffusion until no more nodes can be activated
return _diffuse_all(DG, A)
# perform diffusion for at most "steps" rounds only
return _diffuse_k_rounds(DG, A, steps)
def _diffuse_all(G, A):
layer_i_nodes = [ ]
layer_i_nodes.append([i for i in A])
while True:
len_old = len(A)
A, activated_nodes_of_this_round = _diffuse_one_round(G, A)
layer_i_nodes.append(activated_nodes_of_this_round)
if len(A) == len_old:
break
return layer_i_nodes
def _diffuse_k_rounds(G, A, steps):
layer_i_nodes = [ ]
layer_i_nodes.append([i for i in A])
while steps > 0 and len(A) < len(G):
len_old = len(A)
A, activated_nodes_of_this_round = _diffuse_one_round(G, A)
layer_i_nodes.append(activated_nodes_of_this_round)
if len(A) == len_old:
break
steps -= 1
return layer_i_nodes
def _diffuse_one_round(G, A):
activated_nodes_of_this_round = set()
for s in A:
nbs = G.successors(s)
for nb in nbs:
if nb in A:
continue
active_nb = list(set(G.predecessors(nb)).intersection(set(A)))
            if _influence_sum(G, active_nb, nb) >= G.nodes[nb]['threshold']:
activated_nodes_of_this_round.add(nb)
A.extend(list(activated_nodes_of_this_round))
return A, list(activated_nodes_of_this_round)
def _influence_sum(G, froms, to):
influence_sum = 0.0
for f in froms:
influence_sum += G[f][to]['influence']
return influence_sum
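As a quick sanity check, the model can be exercised on a tiny hand-built graph before moving to a real dataset (a minimal sketch; the node labels and edges below are made up for illustration):

import networkx as nx
from linear_threshold_clime import linear_threshold

G = nx.DiGraph()
G.add_edges_from([(1, 2), (1, 3), (2, 3), (3, 4)])  # toy graph, arbitrary edges
layers = linear_threshold(G, [1])   # diffuse from seed node 1 until convergence
print(layers)   # layers[0] is the seed list, layers[k] the nodes activated at step k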
test_linear_threshold_clime.py (greedy algorithm and test driver)
#!/usr/bin/env python
# coding=UTF-8
from nose.tools import *
import networkx
from linear_threshold_clime import *
"""Test Diffusion Models
----------------------------
"""
# Greedy seed selection
def _linear_clime(G, k):                   # k: number of seed nodes to select
    allnodes = G.nodes()                   # all nodes of the graph
    seed_nodes = []                        # the seed set selected so far
    for m in range(k):
        all_nodes = list(allnodes)         # remaining candidate nodes
        layers_activite = []               # activated-node list per candidate
        lengths = []                       # activation count per candidate
        datas = []
        for i in all_nodes:                # for every candidate i, diffuse from seed_nodes + {i}
            data = []
            data.append(i)
            datas.append(i)
            data_test = seed_nodes + data
            layers = linear_threshold(G, data_test)
            data.pop()
            del layers[-1]                 # drop the final, empty diffusion layer
            length = 0
            layer_data = []
            for j in range(len(layers)):
                length = length + len(layers[j])
                layer_data = layer_data + layers[j]
            length_s = length - len(layers[0])   # activated count, excluding the seeds themselves
            for s in range(len(layers[0])):
                del layer_data[0]
            layers_activite.append(layer_data)
            lengths.append(length_s)
        layers_max = layers_activite[lengths.index(max(lengths))]  # largest activation set this round
        seed_nodes.append(datas[lengths.index(max(lengths))])      # candidate with the largest spread
        allnodes = [i for i in all_nodes if i not in seed_nodes]   # drop chosen seeds from the candidates
    return seed_nodes, layers_max  # the greedy seed set and the node set it activates
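# Note: each greedy round above runs one full LT diffusion per remaining candidate,
# so selecting k seeds costs on the order of k * |V| diffusions; this is why plain
# greedy selection gets slow on graphs the size of Wiki-Vote.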
# Test driver
if __name__ == '__main__':
    datasets = []
    f = open("Wiki-Vote.txt")
    data = f.read()
    rows = data.split('\n')
    for row in rows:
        if not row.strip() or row.startswith('#'):   # skip blank lines and comments
            continue
        split_row = row.split('\t')
        name = (int(split_row[0]), int(split_row[1]))
        datasets.append(name)
    G = networkx.DiGraph()
    G.add_edges_from(datasets)   # build the directed graph from the dataset
    n = input('Please input the number of seeds: k=')
    k = int(n)
    seed_nodes, layers_max = _linear_clime(G, k)   # greedy seed set and its activated nodes
    print(seed_nodes)
    print(layers_max)
Run result: (output screenshot omitted)
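If Wiki-Vote.txt is not at hand, the greedy selector can be smoke-tested on a small random graph (a minimal sketch; the graph size and edge probability are arbitrary, and _linear_clime from the listing above is assumed to be available in the session):

import networkx
G = networkx.gnp_random_graph(50, 0.08, seed=1, directed=True)   # random toy digraph
seed_nodes, layers_max = _linear_clime(G, 3)
print(seed_nodes)        # the 3 greedily chosen seeds
print(len(layers_max))   # how many nodes they activate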
LT_improve.py (an improved LT-model seed-selection algorithm)
#!/usr/bin/env python
# coding=UTF-8
from nose.tools import *
import networkx
from linear_threshold_clime import *
from linear_threshold import *
import math
# Edge weight b(u, v): out_deg(u) divided by the out-degree sum of v's in-neighbors
def Buv_calculate(G, u, v):
    out_deg_all = G.out_degree()   # out-degree of every node
    out_deg = out_deg_all[u]       # out-degree of node u
    out_deg_sum = 0
    for i in G.predecessors(v):    # sum the out-degrees of v's in-neighbors
        out_deg_sum = out_deg_sum + out_deg_all[i]
    return float(out_deg) / out_deg_sum   # float() guards against Python 2 integer division
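# Note: b(u, v) here equals the default edge influence used in linear_threshold():
#     b(u, v) = out_deg(u) / sum_{w in N_in(v)} out_deg(w).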
# AP score of a node (relies on the global graph G built in __main__)
def AP_calculate(node):
    data = []
    data.append(node)
    layer_two_nodes = linear_threshold(G, data, 2)   # nodes reached within two diffusion steps
    data.pop()
    del layer_two_nodes[-1]                          # drop the final diffusion layer
    length = 0
    for i in range(len(layer_two_nodes)):
        length = length + len(layer_two_nodes[i])
    lengths = length - len(layer_two_nodes[0])       # activated count, excluding the node itself
    Buv_sum = 0
    for out_edge in G.successors(node):              # sum b(node, w) over all out-neighbors w
        Buv = Buv_calculate(G, node, out_edge)
        Buv_sum = Buv_sum + Buv
    cha_sum = 1 + math.e ** (-Buv_sum)
    AP = lengths + cha_sum
    return AP
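# Note: in symbols, the score computed above is
#     AP(v) = sigma_2(v) + 1 + exp(-sum of b(v, w) over out-neighbors w),
# where sigma_2(v) counts the nodes (beyond v itself) activated within two LT
# steps and b(v, w) is the same default edge influence used by linear_threshold().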
def select_layers(G, node_list_sorted, k1):    # pick the first k/2 seeds, by descending AP score
    seed_nodes = []                            # selected seed nodes
    for i in range(k1):
        data = []
        data.append(node_list_sorted[0][0])    # take the highest-ranked remaining node
        seed_nodes.append(node_list_sorted[0][0])
        layers = linear_threshold(G, data)     # run the LT model from this single seed
        data.pop()
        del layers[-1]
        layers_activate = []
        for layer in layers:                   # collect the seed plus every node it activates
            for j in layer:
                layers_activate.append(j)
        # drop every activated node from the ranked candidate list
        node_list_sorted = [m for m in node_list_sorted if m[0] not in layers_activate]
    return seed_nodes, node_list_sorted
def _select_others(seed_nodes, other_nodes, k2):   # greedily pick the remaining k/2 seeds (uses the global G)
    for m in range(k2):
        all_nodes = list(other_nodes)          # remaining candidate nodes
        layers_activite = []                   # activated-node list per candidate
        lengths = []                           # activation count per candidate
        datas = []
        for i in all_nodes:                    # for every candidate i, diffuse from seed_nodes + {i}
            data = []
            data.append(i)
            datas.append(i)
            data_test = seed_nodes + data
            layers = linear_threshold(G, data_test)
            data.pop()
            del layers[-1]                     # drop the final, empty diffusion layer
            length = 0
            layer_data = []
            for j in range(len(layers)):
                length = length + len(layers[j])
                layer_data = layer_data + layers[j]
            length_s = length - len(layers[0])     # activated count, excluding the seeds themselves
            for s in range(len(layers[0])):
                del layer_data[0]
            layers_activite.append(layer_data)
            lengths.append(length_s)
        layers_max = layers_activite[lengths.index(max(lengths))]   # largest activation set this round
        seed_nodes.append(datas[lengths.index(max(lengths))])       # candidate with the largest spread
        other_nodes = [i for i in all_nodes if i not in seed_nodes] # drop chosen seeds from the candidates
    return seed_nodes, layers_max   # the full seed set and the node set it activates
if __name__ == '__main__':
    datasets = []
    f = open("Wiki-Vote.txt")
    data = f.read()
    rows = data.split('\n')
    for row in rows:
        if not row.strip() or row.startswith('#'):   # skip blank lines and comments
            continue
        split_row = row.split('\t')
        name = (int(split_row[0]), int(split_row[1]))
        datasets.append(name)
    G = networkx.DiGraph()
    G.add_edges_from(datasets)    # build the directed graph from the dataset
    allnodes = G.nodes()          # all nodes of the graph
    all_nodes = list(allnodes)
    node_dict = {}                # maps each node to its AP score
    for node in all_nodes:        # compute the AP score of every node
        AP = AP_calculate(node)
        node_dict[node] = AP
    node_list_sorted = sorted(node_dict.items(), key=lambda d: d[1], reverse=True)   # sort by AP score, descending
    k = int(input('Please input the number of seeds: k='))
    seed_nodes, node_list_sorted = select_layers(G, node_list_sorted, k // 2)   # first half of the seeds, by AP rank
    for i in seed_nodes:          # remove the chosen seeds from the candidate pool
        if i in all_nodes:
            all_nodes.remove(i)
    other_nodes = all_nodes
    seed_nodes, layers_max = _select_others(seed_nodes, other_nodes, k // 2)   # greedily pick the rest
    layer = linear_threshold(G, seed_nodes)   # final diffusion from the full seed set
    length = len(layers_max)
    print(seed_nodes)
    print(layers_max)
    print(length)
    print(layer)
Run result: (output screenshot omitted)