-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindexes-from-dumps.py
84 lines (66 loc) · 2.71 KB
/
indexes-from-dumps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python3
import argparse
from pickle import Pickler
from collections import defaultdict
from wikidata import process_wikidata_dump, maybe_entity_value
from wikidata import PROPERTY_SUBCLASS_OF, PROPERTY_INSTANCE_OF
def transitive_closure(relation):
    """Return the transitive closure of a successor relation.

    relation maps each item to the collection of its direct successors.
    The result is a new plain dict with the same keys, mapping each item
    to the set of everything reachable from it by following one or more
    successor links.  Items that only ever appear as successors (never as
    keys) are treated as having no successors of their own and get no
    entry in the result.

    The input mapping and its sets are never modified, and the returned
    sets are always fresh objects, so the caller may mutate the result
    freely.
    """
    # Private copies: the result must not alias the caller's sets, and a
    # plain dict guarantees lookups never insert keys (as a defaultdict
    # would) while we iterate.
    closure = {item: set(successors) for item, successors in relation.items()}

    changed = True
    while changed:
        changed = False
        step = {}
        for item, successors in closure.items():
            expanded = set(successors)
            for successor in successors:
                # A successor without an entry has nothing further to add.
                expanded.update(closure.get(successor, ()))
            # expanded is a superset of successors, so a size difference
            # means this pass discovered something new.
            if len(expanded) != len(successors):
                changed = True
            step[item] = expanded
        closure = step
    return closure
def direct_relations_from_dump(dump, language='en'):
    """Collect labels and direct class relations from a Wikidata dump.

    dump is passed straight to process_wikidata_dump (the command-line
    entry point passes the path of the dump file); every entity it yields
    is inspected once.  language selects which label to keep.

    Returns a tuple (labels, instances, subclasses):

    * labels maps an entity id to the 'value' of its label in language;
      entities without a label in that language are omitted.
    * instances maps an entity id to the set of values obtained from its
      PROPERTY_INSTANCE_OF claims.
    * subclasses maps an entity id to the set of values obtained from its
      PROPERTY_SUBCLASS_OF claims.

    instances and subclasses are defaultdict(set) objects.  Only the
    relations stated directly on each entity are recorded; no transitive
    closure is computed here.
    """
    labels = {}
    instances = defaultdict(set)
    subclasses = defaultdict(set)
    # Subclass claims are handled before instance claims for each entity.
    relations = (
        (PROPERTY_SUBCLASS_OF, subclasses),
        (PROPERTY_INSTANCE_OF, instances),
    )

    for entity in process_wikidata_dump(dump):
        eid = entity['id']

        # Treat a missing or empty 'labels' / 'claims' field the same way:
        # the entity simply contributes nothing for that part.
        entity_labels = entity.get('labels') or {}
        if language in entity_labels:
            labels[eid] = entity_labels[language]['value']

        claims = entity.get('claims') or {}
        for prop, relation in relations:
            if prop not in claims:
                continue
            for claim in claims[prop]:
                # maybe_entity_value may yield a falsy result (presumably
                # when the claim has no usable entity value -- confirm
                # against the wikidata module); such claims are skipped.
                target = maybe_entity_value(claim)
                if target:
                    relation[eid].add(target)

    return labels, instances, subclasses
if __name__ == '__main__':
    # Command-line entry point: read a Wikidata dump, build the label,
    # instance and (transitively closed) subclass indexes, and pickle
    # them into a single output file.
    cli = argparse.ArgumentParser(
        description='extract a helper indexes file from a Wikidata dump')
    cli.add_argument('dump', help='path to Wikidata dump file')
    cli.add_argument('output', help='path to output context file')
    cli.add_argument('--language', metavar='Lang', default='en',
                     help='include labels in language Lang')
    options = cli.parse_args()

    labels, instances, direct_subclasses = direct_relations_from_dump(
        options.dump, language=options.language)

    # Only the subclass relation is closed transitively; instances stay
    # as the direct relation read from the dump.
    indexes = {
        'labels': labels,
        'instances': instances,
        'subclasses': transitive_closure(direct_subclasses),
    }

    with open(options.output, 'wb') as outfile:
        Pickler(outfile).dump(indexes)