#!/usr/bin/env python3
# This Python file uses the following encoding: utf-8
__author__ = 'jgeiss'
#############################################################################
# authors: Johanna Geiß, Heidelberg University, Germany #
# email: geiss@informatik.uni-heidelberg.de #
# Copyright (c) 2017 Database Research Group, #
# Institute of Computer Science, #
# University of Heidelberg #
# Licensed under the Apache License, Version 2.0 (the "License"); #
# you may not use this file except in compliance with the License. #
# You may obtain a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.#
# See the License for the specific language governing permissions and #
# limitations under the License. #
#############################################################################
# 02.03.2017 #
# last updated 21.3.2017 by Johanna Geiß #
#############################################################################
# NECKAr: Named Entity Classifier for Wikidata #
# this tool categorizes Wikidata items into 6 categories #
# the parameters are set in NECKAr.cfg #
#############################################################################
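# A minimal NECKAr.cfg sketch (values below are placeholders; the keys are
# exactly the ones read in read_config() and in the __main__ block):
#
#   [Database]
#   host = localhost
#   port = 27017
#   auth = no
#   user =
#   password =
#   db_dump = wikidata
#   db_write = neckar
#   collection_dump = dump
#   collection_write = entities
#
#   [Search_Flags]
#   person = yes
#   location = yes
#   organization = yes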
### Importing modules
import sys
import configparser
from pymongo import MongoClient, ASCENDING, errors
import NECKAr_get_functions as get_functions
from NECKAr_WikidataAPI import get_wikidata_item_tree_item_idsSPARQL
#from NECKAr_wikidata_processor import WikiDataProcessor
import NECKAr_write_functions as write_functions
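# Note: this script targets the PyMongo 3.x API. Collection.initialize_unordered_bulk_op()
# (deprecated in 3.5), Collection.remove(), and Database.authenticate() were all
# removed in PyMongo 4.0, so a 3.x installation is assumed throughout.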
def print_info(info):
    """Prints a tagged status message to stdout."""
    print("INFO\tNECKAr:\t", info)
def read_config(config):
    """Reads the configuration file NECKAr.cfg

    :param config: ConfigParser object
    :return: input and output collection
    """
    host = config.get('Database', 'host')
    port = config.getint('Database', 'port')
    auth = config.getboolean('Database', 'auth')
    user = config.get('Database', 'user')
    password = config.get('Database', 'password')
    try:
        client = MongoClient(host, port)
    except errors.ConnectionFailure:
        print("Connection to the database cannot be made. Please check the config file")
        sys.exit(1)
    db_read_name = config.get('Database', 'db_dump')
    db_write_name = config.get('Database', 'db_write')
    db_in = client[db_read_name]    # database from which information is extracted
    db_out = client[db_write_name]  # database into which information is written
    if auth:
        db_out.authenticate(user, password)
    input_collection_name = config.get('Database', 'collection_dump')
    input_collection = db_in[input_collection_name]
    output_collection_name = config.get('Database', 'collection_write')
    output_collection = db_out[output_collection_name]
    print("Config file read in")
    return input_collection, output_collection
def find_persons(output_collection, input_collection):
    """Finds persons in the Wikidata dump and stores them together with additional information in the output collection

    :param output_collection:
    :param input_collection:
    :return: nothing, writes objects directly to MongoDB
    """
    per_insert = 0
    insert_count = 0
    bulk = output_collection.initialize_unordered_bulk_op()
    print_info("Find persons...")
    print_info("Remove all person entries")
    output_collection.remove({"neClass": "PER"})
    print_info("--- DONE")
    # items that are an instance of (P31) human (Q5)
    person_cursor = input_collection.find({"$and": [{"type": "item"}, {"claims.P31.mainsnak.datavalue.value.numeric-id": 5}]})
    print_info("Beginning of person-loop")
    for item in person_cursor:
        entry = write_functions.write_common_fields(item)
        entry["neClass"] = "PER"
        # date of birth
        dob = get_functions.get_datebirth(item)
        if dob:
            entry["date_birth"] = dob
        # date of death
        dod = get_functions.get_datedeath(item)
        if dod:
            entry["date_death"] = dod
        # gender
        gender = get_functions.get_gender(item)
        if gender:
            entry["gender"] = gender
        # occupation
        occupation = get_functions.get_occupation(item)
        if len(occupation) > 0:
            entry["occupation"] = occupation
        # aliases, alternative names
        alias = get_functions.get_alias_list(item)
        if len(alias) > 0:
            entry["alias"] = alias
        # insert only if the item is not already stored with this class
        wdid = entry["id"]
        doc = output_collection.find_one({"id": wdid})
        if not doc or doc["neClass"] != "PER":
            insert_count += 1
            bulk.insert(entry)
            if insert_count == 1000:
                per_insert += 1
                print_info("PER " + str(per_insert * 1000) + " persons written")
                sys.stdout.flush()
                bulk.execute()
                bulk = output_collection.initialize_unordered_bulk_op()
                insert_count = 0
    if insert_count > 0:
        bulk.execute()
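# Illustrative shape of a PER document written above: the common fields from
# write_common_fields() plus "neClass": "PER" and, where the Wikidata item
# provides them, the optional fields set in the loop, e.g. (values made up):
#
#   {"id": "Q...", "neClass": "PER", "date_birth": ..., "date_death": ...,
#    "gender": ..., "occupation": [...], "alias": [...]}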
def find_locations(output_collection, input_collection):
    """
    Finds locations in the Wikidata dump and stores them together with additional information in the output collection

    :param output_collection:
    :param input_collection:
    :return: nothing, writes objects directly to MongoDB
    """
    # Location specific
    loc_insert = 0
    insert_count = 0
    bulk = output_collection.initialize_unordered_bulk_op()
    output_collection.remove({"neClass": "LOC"})
    print_info("LOC\tremoved old locations")
    # all subclasses (P279) of geographic location (Q2221906), minus the food (Q2095) branch
    geolocation_subclass = get_wikidata_item_tree_item_idsSPARQL([2221906], backward_properties=[279])
    food_subclass = get_wikidata_item_tree_item_idsSPARQL([2095], backward_properties=[279])
    geolocation_subclass = list(set(geolocation_subclass) - set(food_subclass))
    print_info("LOC\t" + str(len(geolocation_subclass)) + str(type(geolocation_subclass)))
    # subclass trees used later to assign a location type
    settlement_subclass = get_wikidata_item_tree_item_idsSPARQL([486972], backward_properties=[279])        # human settlement (Q486972)
    country_subclass = get_wikidata_item_tree_item_idsSPARQL([6256], backward_properties=[279])             # country (Q6256)
    sovereignstate_subclass = get_wikidata_item_tree_item_idsSPARQL([3624078], backward_properties=[279])   # sovereign state (Q3624078)
    ccountry_subclass = get_wikidata_item_tree_item_idsSPARQL([1763527], backward_properties=[279])         # constituent country (Q1763527)
    country_subclass += sovereignstate_subclass + ccountry_subclass
    sea_subclass = get_wikidata_item_tree_item_idsSPARQL([165], backward_properties=[279])                  # sea (Q165)
    state_subclass = get_wikidata_item_tree_item_idsSPARQL([7275], backward_properties=[279])               # state (Q7275)
    city_subclass = get_wikidata_item_tree_item_idsSPARQL([515], backward_properties=[279])                 # city (Q515)
    river_subclass = get_wikidata_item_tree_item_idsSPARQL([4022], backward_properties=[279])               # river (Q4022)
    mountain_subclass = get_wikidata_item_tree_item_idsSPARQL([8502], backward_properties=[279])            # mountain (Q8502)
    mountainr_subclass = get_wikidata_item_tree_item_idsSPARQL([1437459], backward_properties=[279])        # mountain range (Q1437459)
    #POI_subclass = WikiDataProcessor.get_wikidata_item_tree_item_idsSPARQL([XXX], backward_properties=[279])
    hgte_subclass = get_wikidata_item_tree_item_idsSPARQL([15642541], backward_properties=[279])            # human-geographic territorial entity (Q15642541)
    print_info("LOC\tLocation subclasses found")
    location_cursor = input_collection.find({"$and": [
        {"type": "item"},
        {"claims.P31.mainsnak.datavalue.value.numeric-id": {"$in": geolocation_subclass}}]},
        no_cursor_timeout=True)
    print_info("LOC\tLocations found")
    print_info("LOC\tBeginning of location-loop")
    for item in location_cursor:
        entry = write_functions.write_common_fields(item)
        entry["neClass"] = "LOC"
        (incountry, incontinent) = get_functions.get_location_inside(item)
        if len(incountry) != 0:
            entry["in_country"] = incountry
        if len(incontinent) != 0:
            entry["in_continent"] = incontinent
        loc_type = get_functions.get_poi(item, country_subclass, settlement_subclass, city_subclass, sea_subclass, river_subclass, mountain_subclass, mountainr_subclass, state_subclass, hgte_subclass)
        if len(loc_type) != 0:
            entry["location_type"] = loc_type
        coordinate = get_functions.get_coordinate(item)
        if coordinate:
            # { type: "Point", coordinates: [ 40, 5 ] }
            entry["coordinate"] = coordinate
        population = get_functions.get_population(item)
        if population:
            entry["population"] = population
        # is part of LOD link list
        #GN_ID = get_functions.get_geonamesID(item)
        #if GN_ID:
        #    entry["geonamesID"] = GN_ID
        # insert only if the item is not already stored with this class
        wdid = entry["id"]
        doc = output_collection.find_one({"id": wdid})
        if not doc or doc["neClass"] != "LOC":
            insert_count += 1
            bulk.insert(entry)
            if insert_count == 1000:
                loc_insert += 1
                print_info("LOC\t" + str(loc_insert * 1000) + " locations written")
                sys.stdout.flush()
                try:
                    bulk.execute()
                except errors.BulkWriteError as bwe:
                    print(bwe.details)
                    # you can also take this component and do more analysis
                    #werrors = bwe.details['writeErrors']
                    raise
                bulk = output_collection.initialize_unordered_bulk_op()
                insert_count = 0
    location_cursor.close()
    if insert_count > 0:
        bulk.execute()
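# Illustrative shape of a LOC document written above (optional fields depend
# on the item): the common fields plus "neClass": "LOC", and possibly
# "in_country", "in_continent", "location_type", "population", and a GeoJSON
# point "coordinate": {"type": "Point", "coordinates": [lon, lat]}.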
def find_organizations(output_collection, input_collection):
    """
    Finds organizations in the Wikidata dump and stores them together with additional information in the output collection

    :param output_collection:
    :param input_collection:
    :return: nothing, writes objects directly to MongoDB
    """
    # Organization specific
    org_insert = 0
    insert_count = 0
    bulk = output_collection.initialize_unordered_bulk_op()
    print_info("Beginning of organization loop")
    output_collection.remove({"neClass": "ORG"})
    print_info("removed old orgs")
    # all subclasses (P279) of organization (Q43229)
    organization_subclass = get_wikidata_item_tree_item_idsSPARQL([43229], backward_properties=[279])
    #print(len(organization_subclass))
    organization_cursor = input_collection.find({"$and": [{"type": "item"},
        {"claims.P31.mainsnak.datavalue.value.numeric-id": {"$in": organization_subclass}}]})
    for item in organization_cursor:
        entry = write_functions.write_common_fields(item)
        entry["neClass"] = "ORG"
        olang = get_functions.get_official_language(item)
        if len(olang) != 0:
            entry["official_language"] = olang
        inception = get_functions.get_inception(item)
        if inception:
            entry["inception"] = inception
        hq = get_functions.get_hq_location(item)
        if hq:
            entry["hq_location"] = hq
        web = get_functions.get_official_website(item)
        if web:
            entry["official_website"] = web
        founder = get_functions.get_founder(item)
        if len(founder) != 0:
            entry["founder"] = founder
        ceo = get_functions.get_ceo(item)
        if len(ceo) != 0:
            entry["ceo"] = ceo
        country_org = get_functions.get_country(item)
        if len(country_org) != 0:
            entry["country"] = country_org
        instanceof = get_functions.get_instance_of(item)
        if len(instanceof) != 0:
            entry["instance_of"] = instanceof
        # insert only if the item is not already stored with this class
        wdid = entry["id"]
        doc = output_collection.find_one({"id": wdid})
        if not doc or doc["neClass"] != "ORG":
            insert_count += 1
            bulk.insert(entry)
            if insert_count == 1000:
                org_insert += 1
                print_info("ORG " + str(org_insert * 1000) + " organizations written")
                sys.stdout.flush()
                bulk.execute()
                bulk = output_collection.initialize_unordered_bulk_op()
                insert_count = 0
    if insert_count > 0:
        bulk.execute()
########################################################################################################################
if __name__ == "__main__":
    """NECKAr: Named Entity Classifier for Wikidata
    This tool categorizes Wikidata items into 6 categories;
    the parameters are set in NECKAr.cfg."""
    config = configparser.ConfigParser()
    config.read('../NECKAr.cfg')
    input_collection, output_collection = read_config(config)
    output_collection.create_index([('id', ASCENDING)])
    if config.getboolean('Search_Flags', 'person'):
        find_persons(output_collection, input_collection)
    if config.getboolean('Search_Flags', 'location'):
        find_locations(output_collection, input_collection)
    if config.getboolean('Search_Flags', 'organization'):
        find_organizations(output_collection, input_collection)
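# Note: config.read('../NECKAr.cfg') resolves relative to the current working
# directory, not to this file, so run the script from a directory one level
# below the directory that contains NECKAr.cfg.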