Both the outlink and inlink distributions have the vast majority of their mass near zero and decay very rapidly.
import statsmodels.api as sm
from scipy.io import loadmat
from pandas import Series
from numpy import array
# Load the matrix stored under key 'A' in A.mat and cast its entries to int32.
A = loadmat('A.mat')['A'].astype('int32')
# Row sums become `outlinks`, column sums become `inlinks` (presumably A[i, j]
# marks a link from i to j -- confirm against how A.mat was built).
# ravel() produces a 1-D vector whether sum() returns a 2-D (n, 1)/(1, n)
# matrix or an already 1-D array; the earlier `.flatten())[0]` form only
# worked for the 2-D matrix case.
outlinks = Series(array(A.sum(axis=1)).ravel())
inlinks = Series(array(A.sum(axis=0)).ravel())
# Summary statistics (count, mean, std, min, quartiles, max) of the outlink counts.
outlinks.describe()
count 185314.000000 mean 13.126801 std 46.984216 min 0.000000 25% 1.000000 50% 1.000000 75% 3.000000 max 2799.000000 dtype: float64
# Histogram of the raw outlink counts in 20 bins; the trailing semicolon
# suppresses the notebook echo of the returned axes object.
outlinks.hist(bins=20);
# Histogram of log(count + 1) with a log-scaled frequency axis. The +1 keeps
# zero counts finite under the logarithm. `log` is not imported in this file;
# presumably it is numpy's log from a pylab session -- TODO confirm.
log(outlinks+1).hist(bins=20, log=True)
# Cumulative, normalized step histogram of the outlink counts (a binned
# empirical-CDF view). `density=True` replaces the `normed=True` keyword,
# which current matplotlib releases no longer accept.
outlinks.hist(density=True, cumulative=True, bins=30, histtype='step')
# Zoom the y-axis onto the top 6% of the cumulative mass. `ylim` is not
# imported in this file; presumably it comes from a pylab session -- TODO confirm.
ylim(.94,1)
(0.94, 1)
# Empirical CDF of the outlink counts.
ecdf = sm.distributions.ECDF(outlinks)
# Evaluation grid spanning the observed range (linspace's default point count).
# Series.min()/.max() are vectorized, unlike the builtin min()/max(), which
# walk the Series element by element in Python.
x = linspace(outlinks.min(), outlinks.max())
y = ecdf(x)
# Plot against log(x + 1) so the zero end of the grid stays finite.
# `linspace`, `log`, `step` and `ylim` are not imported in this file;
# presumably they come from a pylab session -- TODO confirm.
step(log(x+1), y)
# Focus on the upper tail of the CDF.
ylim(.9,1);
The inlink distribution tells the same story, but is even more extreme.
# Summary statistics (count, mean, std, min, quartiles, max) of the inlink counts.
inlinks.describe()
count 185314.000000 mean 13.126801 std 138.280052 min 0.000000 25% 0.000000 50% 0.000000 75% 2.000000 max 46769.000000 dtype: float64
# Histogram of the raw inlink counts in 20 bins.
inlinks.hist(bins=20)
<matplotlib.axes.AxesSubplot at 0x1187fa890>
# Histogram of log(count + 1) with a log-scaled frequency axis. The +1 keeps
# zero counts finite under the logarithm. `log` is not imported in this file;
# presumably it is numpy's log from a pylab session -- TODO confirm.
log(inlinks+1).hist(bins=20, log=True)
<matplotlib.axes.AxesSubplot at 0x116f0f110>
# Empirical CDF of the inlink counts.
ecdf = sm.distributions.ECDF(inlinks)
# Evaluation grid spanning the observed range (linspace's default point count).
# Series.min()/.max() are vectorized, unlike the builtin min()/max(), which
# walk the Series element by element in Python.
x = linspace(inlinks.min(), inlinks.max())
y = ecdf(x)
# Plot against log(x + 1) so the zero end of the grid stays finite.
# `linspace`, `log`, `step` and `ylim` are not imported in this file;
# presumably they come from a pylab session -- TODO confirm.
step(log(x+1), y)
# Focus on the upper tail of the CDF.
ylim(.95, 1);