from bz2 import BZ2File
import os
from datetime import datetime
from pprint import pprint
from time import time

import numpy as np
from scipy import sparse
from joblib import Memory
from sklearn.decomposition import randomized_svd
from urllib.request import urlopen
# #############################################################################
# Where to download the data, if not already on disk
redirects_url = "http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2"
redirects_filename = redirects_url.rsplit("/", 1)[1]
page_links_url = "http://downloads.dbpedia.org/3.5.1/en/page_links_en.nt.bz2"
page_links_filename = page_links_url.rsplit("/", 1)[1]
resources = [
    (redirects_url, redirects_filename),
    (page_links_url, page_links_filename),
]

for url, filename in resources:
    if not os.path.exists(filename):
        print("Downloading data from '%s', please wait..." % url)
        opener = urlopen(url)
        open(filename, 'wb').write(opener.read())
# #############################################################################
# Loading the redirect files

memory = Memory(location=".")
def index(redirects, index_map, k):
    """Find the index of an article name after redirect resolution"""
    k = redirects.get(k, k)
    return index_map.setdefault(k, len(index_map))
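
# Illustrative note (hypothetical names, not taken from the dump): if
# redirects maps b"Foo" to b"Bar", then index(redirects, index_map, b"Foo")
# and index(redirects, index_map, b"Bar") return the same integer, so both
# names share a single row/column in the adjacency matrix.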
DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")
SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)


def short_name(nt_uri):
    """Remove the < and > URI markers and the common URI prefix"""
    return nt_uri[SHORTNAME_SLICE]
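
# For example (a made-up NT token, for illustration only): the lines read
# from the bz2 dumps are byte strings, so
# short_name(b"<http://dbpedia.org/resource/Algorithm>") == b"Algorithm".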
def get_redirects(redirects_filename):
    """Parse the redirections and build a transitively closed map out of it"""
    redirects = {}
    print("Parsing the NT redirect file")
    for l, line in enumerate(BZ2File(redirects_filename)):
        split = line.split()
        if len(split) != 4:
            print("ignoring malformed line: " + line.decode("utf-8", "replace"))
            continue
        redirects[short_name(split[0])] = short_name(split[2])
        if l % 1000000 == 0:
            print("[%s] line: %08d" % (datetime.now().isoformat(), l))
print("Computing the transitive closure of the redirect relation")
for l, source in enumerate(redirects.keys()):
target = redirects[source]
transitive_target = target
target = redirects.get(target)
if target is None or target in seen:
redirects[source] = transitive_target
print("[%s] line: %08d" % (datetime.now().isoformat(), l))
# disabling joblib as the pickling of large dicts seems much too slow
def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):
"""Extract the adjacency graph as a scipy sparse matrix
Redirects are resolved first.
Returns X, the scipy sparse adjacency matrix, redirects as python
dict from article names to article names and index_map a python dict
from article names to python int (article indexes).
print("Computing the redirect map")
redirects = get_redirects(redirects_filename)
print("Computing the integer index map")
for l, line in enumerate(BZ2File(page_links_filename)):
print("ignoring malformed line: " + line)
i = index(redirects, index_map, short_name(split[0]))
j = index(redirects, index_map, short_name(split[2]))
print("[%s] line: %08d" % (datetime.now().isoformat(), l))
if limit is not None and l >= limit - 1:
print("Computing the adjacency matrix")
X = sparse.lil_matrix((len(index_map), len(index_map)), dtype=np.float32)
print("Converting to CSR representation")
print("CSR conversion done")
return X, redirects, index_map
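
# stop after 5M links to make it possible to work in RAM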
X, redirects, index_map = get_adjacency_matrix(
    redirects_filename, page_links_filename, limit=5000000)
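# invert the index map so that results can be reported by article name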
names = {i: name for name, i in index_map.items()}
print("Computing the principal singular vectors using randomized_svd")
U, s, V = randomized_svd(X, 5, n_iter=3)
print("done in %0.3fs" % (time() - t0))
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])
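
# Note: randomized_svd returns U with shape (n_articles, 5) and V with shape
# (5, n_articles), so U.T[0] and V[0] above are the principal left and right
# singular vectors of the link matrix.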
def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
"""Power iteration computation of the principal eigenvector
This method is also known as Google PageRank and the implementation
is based on the one from the NetworkX project (BSD licensed too)
incoming_counts = np.asarray(X.sum(axis=1)).ravel()
print("Normalizing the graph")
for i in incoming_counts.nonzero()[0]:
X.data[X.indptr[i]:X.indptr[i + 1]] *= 1.0 / incoming_counts[i]
dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0),
scores = np.full(n, 1. / n, dtype=np.float32) # initial guess
    for i in range(max_iter):
        print("power iteration #%d" % i)
        prev_scores = scores
        # damped update: redistribute dangling-node mass and teleportation term
        scores = (alpha * (scores * X + np.dot(dangle, prev_scores))
                  + (1 - alpha) * prev_scores.sum() / n)
        # check convergence: normalized l_inf norm
        scores_max = np.abs(scores).max()
        if scores_max == 0.0:
            scores_max = 1.0
        err = np.abs(scores - prev_scores).max() / scores_max
        print("error: %0.6f" % err)
        if err < n * tol:
            return scores

    return scores
print("Computing principal eigenvector score using a power iteration method")
scores = centrality_scores(X, max_iter=100)
print("done in %0.3fs" % (time() - t0))
pprint([names[i] for i in np.abs(scores).argsort()[-10:]])