Rachunek teoretyczny prawdopodobieńst [hipoteza]

$C_{ij} = \frac{[i \in j-tej chmury]} {ile\ chmur\ ma\ i}$

$D_{ij} = \frac{[j \in i-tej chmury]} {ile\ wierzchołków\ ma\ j}$

A = DC

In [1]:
%matplotlib inline

Deprecated

todo

  • document properly what happens here
  • write about assumptions
  • write about conclusions
  • how can I improve this pipeline?
In [2]:
import numpy as np
import networkx as nx
from collections import Counter
from matplotlib import pyplot as plt
from numpy.random import random_sample
from hypergraph.generators import uniform_hypergraph
from hypergraph import converters
from hypergraph.markov_diffusion import create_markov_matrix
from hypergraph import utils


def create_graph(number_of_nodes, cardinality, fraction_of_hyperedges):
    HG = uniform_hypergraph(
        n=number_of_nodes,
        k=cardinality,
        number_of_edges=int(
            number_of_nodes *
            fraction_of_hyperedges))

    nodes = HG.nodes()
    hyper_edges = HG.hyper_edges()
    markov_matrix = create_markov_matrix(hyper_edges)
    current_state = 1
    return HG
In [3]:
def analytical_solution_of_diffusion(nodes, hyperedges, plot_results=False):
    all_nodes = []
    for hyperedge in hyperedges:
        all_nodes += hyperedge
        
    c = Counter(all_nodes)
    for node in nodes:
        if node not in all_nodes:
            c[node]=0
            
    xs, ys = zip(*c.items())
    
    sum_of_ys = sum(ys)
    ys = [float(y) / sum_of_ys for y in ys]
    
    if plot_results:
        plt.bar(xs, ys)
        plt.title("Analytical prediction on diffusion on hypergraph")
        
    return ys
In [4]:
HG = create_graph(10, 3, 0.7)

utils.plot_different_representations(HG.nodes(), HG.hyper_edges())
Drawing different representations of hypergraph
Bipartite graph
Graph of hypereges as nodes
Clique graph
In [5]:
analytical_solution_of_diffusion(HG.nodes(), HG.hyper_edges())
Out[5]:
[0.0,
 0.09523809523809523,
 0.09523809523809523,
 0.19047619047619047,
 0.14285714285714285,
 0.09523809523809523,
 0.09523809523809523,
 0.09523809523809523,
 0.14285714285714285,
 0.047619047619047616]
In [6]:
def compute_C(hypergraph):
    nodes = hypergraph.nodes()
    hyper_edges = hypergraph.hyper_edges()
    
    N = len(nodes)
    M = len(hyper_edges)
    C = np.zeros((N, M))
    for i, node in enumerate(nodes):
        for j, edge in enumerate(hyper_edges):
            if node in edge:
                C[i][j] = 1
                
    for i in range(N):
        if np.sum(C[i][:]):
            C[i] /= np.sum(C[i][:])
    return C

def compute_D(hypergraph):
    nodes = hypergraph.nodes()
    hyper_edges = hypergraph.hyper_edges()
    
    N = len(nodes)
    M = len(hyper_edges)
    D = np.zeros((M, N))
    for i, edge in enumerate(hyper_edges):
        for j, node in enumerate(nodes):
            if node in edge:
                D[i][j] = 1
    for i in range(M):
        if np.sum(D[i][:]):
            D[i] /= np.sum(D[i][:])
    return D
In [7]:
C = compute_C(HG)
In [9]:
D = compute_D(HG)
In [10]:
C
Out[10]:
array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ],
       [ 0.25      ,  0.        ,  0.25      ,  0.        ,  0.        ,
         0.25      ,  0.25      ],
       [ 0.        ,  0.        ,  0.33333333,  0.33333333,  0.33333333,
         0.        ,  0.        ],
       [ 0.33333333,  0.33333333,  0.        ,  0.        ,  0.        ,
         0.33333333,  0.        ],
       [ 0.        ,  0.33333333,  0.33333333,  0.        ,  0.        ,
         0.        ,  0.33333333],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.5       ,
         0.        ,  0.5       ],
       [ 0.        ,  0.5       ,  0.        ,  0.5       ,  0.        ,
         0.        ,  0.        ],
       [ 0.5       ,  0.        ,  0.        ,  0.5       ,  0.        ,
         0.        ,  0.        ]])
In [11]:
D
Out[11]:
array([[ 0.        ,  0.        ,  0.        ,  0.33333333,  0.        ,
         0.33333333,  0.        ,  0.        ,  0.        ,  0.33333333],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.33333333,  0.33333333,  0.        ,  0.33333333,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.33333333,  0.33333333,
         0.        ,  0.33333333,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.33333333,
         0.        ,  0.        ,  0.        ,  0.33333333,  0.33333333],
       [ 0.        ,  0.33333333,  0.        ,  0.        ,  0.33333333,
         0.        ,  0.        ,  0.33333333,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.33333333,  0.33333333,  0.        ,
         0.33333333,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.33333333,  0.        ,
         0.        ,  0.33333333,  0.33333333,  0.        ,  0.        ]])
In [12]:
np.dot(C, D)
Out[12]:
array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.33333333,  0.        ,  0.        ,  0.33333333,
         0.        ,  0.        ,  0.33333333,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.33333333,  0.33333333,  0.        ,
         0.33333333,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.08333333,  0.33333333,  0.08333333,
         0.16666667,  0.16666667,  0.08333333,  0.        ,  0.08333333],
       [ 0.        ,  0.11111111,  0.        ,  0.11111111,  0.33333333,
         0.        ,  0.11111111,  0.11111111,  0.11111111,  0.11111111],
       [ 0.        ,  0.        ,  0.11111111,  0.22222222,  0.        ,
         0.33333333,  0.11111111,  0.        ,  0.11111111,  0.11111111],
       [ 0.        ,  0.        ,  0.        ,  0.22222222,  0.11111111,
         0.11111111,  0.33333333,  0.11111111,  0.11111111,  0.        ],
       [ 0.        ,  0.16666667,  0.        ,  0.16666667,  0.16666667,
         0.        ,  0.16666667,  0.33333333,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.16666667,
         0.16666667,  0.16666667,  0.        ,  0.33333333,  0.16666667],
       [ 0.        ,  0.        ,  0.        ,  0.16666667,  0.16666667,
         0.16666667,  0.        ,  0.        ,  0.16666667,  0.33333333]])
In [13]:
%load_ext autoreload
%autoreload 2
In [14]:
markov_matrix = create_markov_matrix(HG.hyper_edges(), count_itself=False)
In [15]:
markov_matrix_with_itself = create_markov_matrix(HG.hyper_edges(), count_itself=True)
In [16]:
markov_matrix_with_itself
Out[16]:
array([[ 0.        ,  0.16666667,  0.16666667,  0.16666667,  0.        ,
         0.33333333,  0.16666667],
       [ 0.2       ,  0.        ,  0.2       ,  0.2       ,  0.        ,
         0.2       ,  0.2       ],
       [ 0.14285714,  0.14285714,  0.        ,  0.14285714,  0.14285714,
         0.14285714,  0.28571429],
       [ 0.25      ,  0.25      ,  0.25      ,  0.        ,  0.25      ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.25      ,  0.25      ,  0.25      ,
         0.        ,  0.25      ],
       [ 0.33333333,  0.16666667,  0.16666667,  0.        ,  0.        ,
         0.16666667,  0.16666667],
       [ 0.16666667,  0.16666667,  0.33333333,  0.        ,  0.16666667,
         0.16666667,  0.        ]])
In [17]:
markov_matrix
Out[17]:
array([[ 0.        ,  0.16666667,  0.16666667,  0.16666667,  0.        ,
         0.33333333,  0.16666667],
       [ 0.2       ,  0.        ,  0.2       ,  0.2       ,  0.        ,
         0.2       ,  0.2       ],
       [ 0.14285714,  0.14285714,  0.        ,  0.14285714,  0.14285714,
         0.14285714,  0.28571429],
       [ 0.25      ,  0.25      ,  0.25      ,  0.        ,  0.25      ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.33333333,  0.33333333,  0.        ,
         0.        ,  0.33333333],
       [ 0.4       ,  0.2       ,  0.2       ,  0.        ,  0.        ,
         0.        ,  0.2       ],
       [ 0.16666667,  0.16666667,  0.33333333,  0.        ,  0.16666667,
         0.16666667,  0.        ]])
In [18]:
np.dot(np.dot(np.dot(np.dot(markov_matrix, markov_matrix), markov_matrix), markov_matrix), markov_matrix)
Out[18]:
array([[ 0.15864414,  0.13472476,  0.19218626,  0.11842592,  0.07871192,
         0.14532802,  0.17197897],
       [ 0.16166972,  0.1348435 ,  0.19271618,  0.11746677,  0.07875497,
         0.1422268 ,  0.17232206],
       [ 0.16473108,  0.13765442,  0.19229482,  0.1135725 ,  0.08217065,
         0.14007713,  0.1694994 ],
       [ 0.17763888,  0.14683346,  0.19875188,  0.09810091,  0.09233917,
         0.13118372,  0.15515198],
       [ 0.15742385,  0.13125828,  0.19173151,  0.12311889,  0.07547052,
         0.14369381,  0.17730313],
       [ 0.17439363,  0.1422268 ,  0.19610798,  0.10494698,  0.08621629,
         0.13322011,  0.16288822],
       [ 0.17197897,  0.14360172,  0.1977493 ,  0.10343466,  0.08865157,
         0.13574018,  0.15884361]])
In [19]:
from numpy import linalg as LA

LA.matrix_power(markov_matrix_with_itself, 1011)
Out[19]:
array([[ 0.15789474,  0.13157895,  0.18421053,  0.10526316,  0.10526316,
         0.15789474,  0.15789474],
       [ 0.15789474,  0.13157895,  0.18421053,  0.10526316,  0.10526316,
         0.15789474,  0.15789474],
       [ 0.15789474,  0.13157895,  0.18421053,  0.10526316,  0.10526316,
         0.15789474,  0.15789474],
       [ 0.15789474,  0.13157895,  0.18421053,  0.10526316,  0.10526316,
         0.15789474,  0.15789474],
       [ 0.15789474,  0.13157895,  0.18421053,  0.10526316,  0.10526316,
         0.15789474,  0.15789474],
       [ 0.15789474,  0.13157895,  0.18421053,  0.10526316,  0.10526316,
         0.15789474,  0.15789474],
       [ 0.15789474,  0.13157895,  0.18421053,  0.10526316,  0.10526316,
         0.15789474,  0.15789474]])
In [20]:
LA.matrix_power(markov_matrix, 3)
len(markov_matrix)
Out[20]:
7
In [24]:
from hypergraph.markov_diffusion import DiffusionEngine, count_nodes
from hypergraph import utils
In [25]:
def simulate_diffusion(hypergraph, markov_matrix, t_max, current_state=1, plot_results=False):
    nodes = hypergraph.nodes()
    hyper_edges = hypergraph.hyper_edges()
    engine = DiffusionEngine(markov_matrix)
    most_common, states = engine.simulate(t_max)
    most_common_nodes = count_nodes(nodes, hyper_edges, most_common)
    if plot_results:
        plt.figure(figsize=(8, 4))
        utils.plot_hyperedges_frequencies(most_common, hyper_edges,
                                'Ocurrences of hyperedges in a hypergraph', normed=True)

        plt.figure(figsize=(8, 4))
        return utils.plot_nodes_frequencies(most_common_nodes, 'Nodes in a hypergraph', normed=True)
    else:
        return utils.get_names_and_occurrences(most_common_nodes)[1]
    
In [26]:
simulate_diffusion(HG, markov_matrix, 100)
Out[26]:
[0.0,
 0.023333333333333334,
 0.05,
 0.21333333333333335,
 0.11333333333333333,
 0.15,
 0.17666666666666667,
 0.09333333333333334,
 0.09666666666666666,
 0.08333333333333333]
In [27]:
LA.matrix_power(markov_matrix_with_itself, 10)
Out[27]:
array([[ 0.15796624,  0.13162897,  0.18425894,  0.10517952,  0.1052864 ,
         0.15786611,  0.15781383],
       [ 0.15795476,  0.13162236,  0.18425508,  0.10519071,  0.10528765,
         0.15786676,  0.15782268],
       [ 0.15793623,  0.13161077,  0.18424984,  0.10520921,  0.10529296,
         0.15786411,  0.15783688],
       [ 0.15776928,  0.13148839,  0.18411611,  0.10541481,  0.10520923,
         0.15795596,  0.15804622],
       [ 0.15792959,  0.13160956,  0.18426268,  0.10520923,  0.10533039,
         0.1578301 ,  0.15782845],
       [ 0.15786611,  0.13155563,  0.18417479,  0.10530397,  0.10522007,
         0.15793716,  0.15794226],
       [ 0.15781383,  0.1315189 ,  0.18414303,  0.10536415,  0.10521897,
         0.15794226,  0.15799886]])
In [28]:
analytical_solution_of_diffusion(HG.nodes(), HG.hyper_edges(), True)
Out[28]:
[0.0,
 0.047619047619047616,
 0.047619047619047616,
 0.19047619047619047,
 0.14285714285714285,
 0.14285714285714285,
 0.14285714285714285,
 0.09523809523809523,
 0.09523809523809523,
 0.09523809523809523]
In [29]:
simulate_diffusion(HG, markov_matrix_with_itself, 100, 1,  True)
Out[29]:
[0.0,
 0.06666666666666667,
 0.04,
 0.16,
 0.18,
 0.13333333333333333,
 0.14,
 0.08666666666666667,
 0.10666666666666667,
 0.08666666666666667]
In [30]:
def diffusion_on_clique(nodes, hyper_edges, t_max, plot_results=False):
    clique_graph = converters.convert_to_clique_graph(nodes, hyper_edges)
    clique_markov_matrix = create_markov_matrix(clique_graph.edges(), count_itself=False)

    current_state = 1
    engine = DiffusionEngine(clique_markov_matrix)
    most_common, states = engine.simulate(t_max)
    most_common_nodes = count_nodes(clique_graph.nodes(), clique_graph.edges(),
                                    most_common)
    
    if plot_results:
        plt.figure(figsize=(8, 4))
        utils.plot_hyperedges_frequencies(most_common, clique_graph.edges(),
                                'Ocurrences of edges in a graph')
    
        plt.figure(figsize=(8, 4))
        return utils.plot_nodes_frequencies(most_common_nodes, 'Nodes in a graph')
    else:
        return utils.get_names_and_occurrences(most_common_nodes)[1]
    
    
In [31]:
diffusion_on_clique(HG.nodes(), HG.hyper_edges(), t_max=1000)
Out[31]:
[0.0, 0.035, 0.047, 0.191, 0.172, 0.1255, 0.154, 0.095, 0.0895, 0.091]
In [32]:
from scipy import stats

def analytical_clique_diffusion(hypergraph, plot_results=False):
    xs, ys = zip(*hypergraph.degree().items())
    sum_of_ys = sum(ys)
    ys = [float(y) / sum_of_ys for y in ys]
    
    if plot_results:
        plt.bar(xs, ys)
        plt.title("Analytical prediction of diffusion on clique")
    return ys


def correct_zero(node):
    if node == 0:
        node += 0.00001
    return node


def compare_to_theory(experimental, theoretical_1, theoretical_2):
    experimental = [correct_zero(node) for node in experimental]
    theoretical_1 = [correct_zero(node) for node in theoretical_1]
    theoretical_2 = [correct_zero(node) for node in theoretical_2]
    
    return stats.chisquare(experimental, f_exp=[theoretical_1, theoretical_2], axis=1)
    

def comparing_pipeline(t_max=100000):
    for n in range(5, 6, 5):
        for k in range(3, 4):
            for f in range(80, 81, 10):
                f = float(f) / 100
                # number_of_nodes, cardinality, fraction_of_hyperedges
                print(n, k, f)
                HG = create_graph(n, k, f)
                number_of_nodes = analytical_solution_of_diffusion(HG.nodes(), HG.hyper_edges(), plot_results=False)
                number_of_nodes_clique = analytical_clique_diffusion(HG, plot_results=False)
                markov_matrix_with_itself = create_markov_matrix(HG.hyper_edges(), count_itself=True)
                markov_matrix = create_markov_matrix(HG.hyper_edges(), count_itself=False)
                
                simulated_n_o_n = simulate_diffusion(HG, markov_matrix, t_max, plot_results=False)
                simulated_n_o_n_i = simulate_diffusion(HG, markov_matrix_with_itself, t_max)
                
                simulated_n_o_n_c = diffusion_on_clique(HG.nodes(), HG.hyper_edges(), t_max=t_max)
                
                print(compare_to_theory(simulated_n_o_n, number_of_nodes, number_of_nodes_clique))
                print(compare_to_theory(simulated_n_o_n_i, number_of_nodes, number_of_nodes_clique))
                print(compare_to_theory(simulated_n_o_n_c, number_of_nodes, number_of_nodes_clique))
                
                print(simulated_n_o_n)
                print(simulated_n_o_n_i)
                print(simulated_n_o_n_c)
                plt.figure(figsize=(12, 10))
                
                width = 0.15  
                plt.bar(HG.nodes(), simulated_n_o_n,
                            width=width, color='crimson',
                            label='Simulated markov hypergraph')
                
                plt.bar(np.array(HG.nodes()) + width, simulated_n_o_n_i, width,
                            color='burlywood',
                            label='Simulated markov hypergraph with loops')
                
                plt.bar(np.array(HG.nodes()) + 2 * width, number_of_nodes, width,
                            label='Analytical diffusion model on hypergraph',
                            color="#65df25")
                
                plt.bar(np.array(HG.nodes()) + 3 * width, simulated_n_o_n_c, width,
                            label='Simulated clique graph')

                plt.bar(np.array(HG.nodes()) + 4 * width, number_of_nodes_clique, width,
                            label='Analytical diffusion model on clique', color="#dcab11")
                
                plt.legend(loc=0)
In [33]:
comparing_pipeline()
5 3 0.8
(array([ 0.00473559,  0.06387746]), array([ 0.9999972 ,  0.99950069]))
(array([ 0.00083584,  0.04284002]), array([ 0.99999991,  0.99977384]))
(array([ 0.02596636,  0.00471509]), array([ 0.99991644,  0.99999723]))
[0.16686333333333334, 0.24992, 0.24988333333333335, 0.26720333333333335, 0.06613]
[0.15691333333333332, 0.25465333333333334, 0.2551, 0.25215666666666664, 0.08117666666666666]
[0.185145, 0.26233, 0.26336, 0.184525, 0.10464]
In [ ]: