from github import Github
import networkx as nx
# Replace this with your own GitHub API key
TOKEN = ''
######## Constant definitions ##########
# Edge Types (stored in the 'interaction' edge attribute)
COMMIT_TO = 'commit_to'
OWNER_OF = 'owner_of'
# Node Types (stored in the 'type' node attribute)
USER = 'user'
REPO = 'repo'
SEED = 'seed'
SEED_COMMITTER = 'seed_committer'
# Threshold used to decide whether a repository is added to the graph:
# repositories with fewer stargazers than this are skipped.
STAR_TH = 100
# Repositories ("owner/name") that act as the seeds of the graph
SEEDS = [
'mbostock/d3',
'nnnick/Chart.js',
'trifacta/vega',
'misoproject/d3.chart',
'novus/nvd3',
'simplegeo/polymaps',
'lmccart/p5.js'
# 'ContinuumIO/bokeh',
# 'matplotlib/matplotlib',
# 'DmitryBaranovskiy/raphael/',
# 'processing/processing'
]
# GitHub API client built from TOKEN (per_page=100 is passed through to PyGithub)
client = Github(TOKEN, per_page=100)
# Network data that is ultimately visualized. A MultiDiGraph permits
# parallel directed edges between the same pair of nodes.
g = nx.MultiDiGraph(name='Github Universe')
# Extract committers of the project
def extract_committers(repo, repo_node, expand_user_repos, user_type):
    """Add the contributors of *repo* to the global graph ``g``.

    Every contributor becomes a node keyed by its login and is linked to
    *repo_node* with a ``COMMIT_TO`` edge.

    Args:
        repo: Repository object exposing ``get_contributors()``.
        repo_node: ID of the graph node that represents *repo*.
        expand_user_repos: When true, each contributor's own repositories
            (``get_repos()``) are handed to ``add_user_repos``.
        user_type: Value for the node's ``type`` attribute.  It is applied
            to contributors that are new to the graph; an existing node is
            only re-tagged when *user_type* is ``SEED_COMMITTER``.
    """
    for committer in repo.get_contributors():
        login = committer.login
        if not g.has_node(login):
            g.add_node(login, type=user_type)
            extract_user_info(g, login, committer)
        elif user_type == SEED_COMMITTER:
            # Promote a user first seen through a non-seed repository.
            # The reverse (overwriting SEED_COMMITTER with USER) is skipped
            # on purpose: previously the final label depended on crawl
            # order.  The profile is not fetched again either, because the
            # node already carries it.
            g.add_node(login, type=user_type)
        g.add_edge(login, repo_node, interaction=COMMIT_TO)
        if expand_user_repos:
            add_user_repos(login, committer.get_repos())
# Filter major projects
def add_user_repos(user, repos):
    """Link *user* to their highly starred repositories in the global graph.

    Repositories with fewer than ``STAR_TH`` stargazers are ignored.  For
    every remaining repository an ``OWNER_OF`` edge from *user* is added.
    A repository that is not yet in the graph is also registered as a
    ``REPO`` node and its contributors are crawled as ``USER`` nodes.

    Args:
        user: ID of the graph node of the user owning *repos*.
        repos: Iterable of repository objects (e.g. from ``get_repos()``).
    """
    for repo in repos:
        # Pick only highly starred projects
        if repo.stargazers_count < STAR_TH:
            continue

        repo_name = repo.full_name
        is_new = not g.has_node(repo_name)
        if is_new:
            g.add_node(repo_name, type=REPO)
            extract_repo_info(g, repo_name, repo)

        g.add_edge(user, repo_name, interaction=OWNER_OF)

        # Crawl contributors only the first time a repository is seen.
        # Re-crawling a known repository would add duplicate parallel
        # COMMIT_TO edges to the multigraph and repeat the API requests.
        if is_new:
            extract_committers(repo, repo_name, False, USER)
# Extract user information
def extract_user_info(graph, user_id, user):
    """Copy profile fields of *user* onto the existing node *user_id*.

    Sets the node attributes ``login``, ``name`` (falls back to *user_id*
    when the profile has no name), ``followers``, ``location``, ``bio`` and
    ``score`` (equal to ``followers``).

    Args:
        graph: Graph whose ``nodes`` mapping holds the attribute dict of
            *user_id* (NetworkX >= 2.0).
        user_id: ID of the node to annotate; it must already exist.
        user: Object exposing ``name``, ``followers``, ``location``, ``bio``.

    Raises:
        KeyError: If *user_id* is not a node of *graph*.
    """
    # ``Graph.node`` was removed in NetworkX 2.4; ``Graph.nodes[n]`` is the
    # supported way to reach a node's attribute dict.
    node = graph.nodes[user_id]

    name = user.name
    if name is None:
        name = user_id

    node['login'] = user_id
    node['name'] = name
    node['followers'] = user.followers
    node['location'] = user.location
    node['bio'] = user.bio
    node['score'] = node['followers']
# Extract repository information
def extract_repo_info(graph, repo_id, repo):
    """Copy metadata of *repo* onto the existing node *repo_id*.

    Sets the node attributes ``name``, ``description``, ``homepage``,
    ``stargazers``, ``watchers``, ``fork_count``, ``score`` (watchers plus
    stargazers) and ``language``.

    Args:
        graph: Graph whose ``nodes`` mapping holds the attribute dict of
            *repo_id* (NetworkX >= 2.0).
        repo_id: ID of the node to annotate; it must already exist.
        repo: Object exposing ``name``, ``description``, ``homepage``,
            ``stargazers_count``, ``watchers_count``, ``forks_count`` and
            ``language``.

    Raises:
        KeyError: If *repo_id* is not a node of *graph*.
    """
    # ``Graph.node`` was removed in NetworkX 2.4; ``Graph.nodes[n]`` is the
    # supported way to reach a node's attribute dict.
    node = graph.nodes[repo_id]

    node['name'] = repo.name
    node['description'] = repo.description
    node['homepage'] = repo.homepage
    node['stargazers'] = repo.stargazers_count
    node['watchers'] = repo.watchers_count
    node['fork_count'] = repo.forks_count
    # NOTE(review): GitHub's REST API is documented to report
    # watchers_count equal to stargazers_count, so this score is probably
    # twice the star count -- confirm that this is intended.
    node['score'] = node['watchers'] + node['stargazers']
    node['language'] = repo.language
# NOTE: The crawl below takes a very long time...
# Crawl every seed repository: register it as a SEED node, attach its
# metadata, then walk its contributors and expand their own repositories.
for seed in SEEDS:
    repo = client.get_repo(seed)
    repo_name = repo.full_name
    g.add_node(repo_name, type=SEED)
    extract_repo_info(g, repo_name, repo)
    extract_committers(
        repo,
        repo_name,
        expand_user_repos=True,
        user_type=SEED_COMMITTER,
    )
# The current NetworkX GraphML writer does not support None values, so they are replaced with empty strings.
# Report the size of the crawled graph.
print(g.number_of_nodes())
print(g.number_of_edges())

# Replace None with an empty string in every node attribute.
# nodes(data=True) yields the live attribute dicts, so they can be patched
# in place; unlike ``g.node[...]`` (removed in NetworkX 2.4) this works on
# old and current NetworkX releases alike.
for node_id, node in g.nodes(data=True):
    # Only values of existing keys are reassigned, so the dict does not
    # change size while it is being iterated.
    for key, value in node.items():
        if value is None:
            node[key] = ''
# Output of the two print calls above: 1423 (nodes) and 1963 (edges)
# GraphML is easy to work with in Cytoscape, so the graph is exported in GraphML format here.
# Serialize the graph to 'github_universe.graphml' (path relative to the working directory)
nx.write_graphml(g, 'github_universe.graphml')