Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OSM-POI: Include brand property [DRAFT] #69

Draft
wants to merge 43 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
86ef2c5
add name-suggestion-index download
IritaSee Jan 3, 2022
1c62f71
add repo checking
IritaSee Jan 3, 2022
b74d373
[undone] interating over json
IritaSee Jan 3, 2022
d6fb766
add brand_name_downloader
IritaSee Jan 5, 2022
ac14e90
add name downloader to main, add operator naming
IritaSee Jan 5, 2022
f5d401f
rename to more suitable function names
IritaSee Jan 6, 2022
600c3ef
add staticmethod for download_names
IritaSee Jan 6, 2022
017b99d
add staticmethod to download_names
IritaSee Jan 6, 2022
96ae427
fix variable names to add context
IritaSee Jan 6, 2022
69ac660
fix variable typo
IritaSee Jan 7, 2022
d2c2460
add debug venv folder to ignore
IritaSee Jan 7, 2022
f173ec5
change - to None as default value
IritaSee Jan 7, 2022
2ff61be
add operator:wikidata
IritaSee Jan 7, 2022
3c93476
add func to match brands and operators, then add to spark
IritaSee Jan 7, 2022
2f3896f
fix algorithm
IritaSee Jan 8, 2022
a572b30
remove unused code
IritaSee Jan 8, 2022
d2467a7
update fuzzywuzzy to thefuzz in osm-poi related
IritaSee Jan 8, 2022
e74f64b
fix nan processing
IritaSee Jan 8, 2022
1290708
fix: change search to brand and operator
IritaSee Jan 10, 2022
6230a6b
recreate matching function
IritaSee Jan 11, 2022
6519b76
apply withcolumn in main matching function
IritaSee Jan 11, 2022
e78b8f7
fix typo
IritaSee Jan 12, 2022
3e16563
remove is_operator
IritaSee Jan 13, 2022
027f2d6
join operator and brand name matching function
IritaSee Jan 13, 2022
73f719e
remove duplicate name/operator
IritaSee Jan 13, 2022
310e06c
rework function to simply match names and input
IritaSee Jan 13, 2022
a91e266
fix error
IritaSee Jan 13, 2022
32e767f
Merge branch 'master' into feature/include-brand-property
IritaSee Jan 15, 2022
848f984
add brand_matched operator_matched name_matched
IritaSee Jan 20, 2022
d101e2d
fix run_cli convert add extra cd
IritaSee Jan 22, 2022
724ac03
readjust dowloader to new temp folder
IritaSee Jan 22, 2022
c2bb65c
add default statement
IritaSee Jan 24, 2022
898945b
update temp dir
IritaSee Jan 24, 2022
f08ea40
add downloading message
IritaSee Jan 24, 2022
1c153fb
add empty as return
IritaSee Jan 27, 2022
1c4b9dc
fix missleadnig var name
IritaSee Jan 27, 2022
2489e94
revert irrelevant change to this branch
IritaSee Jan 27, 2022
c01806f
code cleanup
IritaSee Jan 27, 2022
a149f1b
delete reference repo, rename reference file
IritaSee Jan 27, 2022
f4e651b
add id sorting for reference file, change print to log
IritaSee Jan 27, 2022
fae5761
remove reference repo, ignore reference file
IritaSee Jan 27, 2022
70f7255
Merge branch 'master' into feature/include-brand-property
Feb 2, 2022
0661a62
Resolve formatting and linting errors; Remove name matching UDF from …
Feb 3, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ env
!.env.local
__pycache__


debug
kuwala/scripts/windows/
2 changes: 1 addition & 1 deletion kuwala/common/python_utils/src/FileSelector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pycountry
import pycountry_convert as pcc
import requests.exceptions
from fuzzywuzzy import fuzz
from thefuzz import fuzz
from pyquery import PyQuery
from hdx.data.dataset import Dataset
from hdx.data.organization import Organization
Expand Down
2 changes: 1 addition & 1 deletion kuwala/common/python_utils/src/spark_udfs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import h3
import json
from fuzzywuzzy import fuzz
from thefuzz import fuzz
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType, FloatType, IntegerType, StringType, StructField, StructType
from shapely.geometry import shape
Expand Down
2 changes: 1 addition & 1 deletion kuwala/core/cli/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ docopt==0.6.2
email-validator==1.1.3
et-xmlfile==1.1.0
exchangerates==0.3.4
fuzzywuzzy==0.18.0
thefuzz==0.19.0
greenlet==1.1.1
hdx-python-api==5.2.4
hdx-python-country==2.9.5
Expand Down
5 changes: 4 additions & 1 deletion kuwala/core/database/importer/sql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ CREATE TABLE IF NOT EXISTS osm_poi (
latitude decimal NOT NULL,
longitude decimal NOT NULL,
name text,
name_matched text,
tags text[],
categories text[],
address_house_nr text,
Expand All @@ -69,7 +70,9 @@ CREATE TABLE IF NOT EXISTS osm_poi (
email text,
website text,
brand text,
brand_matched text,
operator text,
operator_matched text,
boundary text,
admin_level smallint,
type text,
Expand Down Expand Up @@ -148,7 +151,7 @@ CREATE TABLE IF NOT EXISTS google_osm_poi_matching (
CONSTRAINT fk_google_osm_poi_matching_osm_id FOREIGN KEY(osm_type, osm_id) REFERENCES osm_poi(osm_type, osm_id)
);

-- Creation of google_osm_poi_matching table
-- Creation of google_custom_poi_matching table

CREATE TABLE IF NOT EXISTS google_custom_poi_matching (
custom_id text NOT NULL PRIMARY KEY,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pyspark.sql.functions import lit


def import_population_density(spark, database_url, database_properties, continent, country, population_density_date):
def import_population_density(spark, database_url, database_properties, continent, country, population_density_date=''):
start_time = time.time()

logging.info(f'Starting import of population density data for {country}, {continent}')
Expand Down
2 changes: 1 addition & 1 deletion kuwala/pipelines/osm-poi/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ docopt==0.6.2
email-validator==1.1.3
et-xmlfile==1.1.0
exchangerates==0.3.4
fuzzywuzzy==0.18.0
thefuzz==0.19.0
greenlet==1.1.1
h3==3.7.3
hdx-python-api==5.2.4
Expand Down
50 changes: 48 additions & 2 deletions kuwala/pipelines/osm-poi/src/Downloader.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import os
from python_utils.src.FileDownloader import download_file
from python_utils.src.FileSelector import select_osm_file

import urllib.request as req
import zipfile
import json
import pandas as pd

class Downloader:
@staticmethod
def start(args):
def download_pbf(args):
file = None

if args.url is None:
Expand All @@ -23,3 +26,46 @@ def start(args):
file_path += '/pbf/geo_fabrik.osm.pbf'

download_file(url=args.url or file['url'], path=file_path)

IritaSee marked this conversation as resolved.
Show resolved Hide resolved
@staticmethod
def download_names():
    """Download the name-suggestion-index and write a brand/operator reference CSV.

    The repository archive is downloaded and extracted only when the
    extracted folder is not already present in the temp directory. Every
    JSON file below ``data/brands`` and ``data/operators`` is then read and
    one row per item is collected with the columns ``id``, ``display_name``
    and ``wiki_data`` (``brand:wikidata`` if the item has that tag, otherwise
    ``operator:wikidata``, otherwise empty). Rows that repeat the same
    ``display_name``/``wiki_data`` pair are dropped before the result is
    written to ``names.csv`` in the temp directory.
    """
    temp_files_dir = '../../../tmp/kuwala/osm_files/'
    repo_dir = os.path.join(temp_files_dir, 'name-suggestion-index-main')

    # Instead of cloning the repository (which would need an extra library),
    # download the whole repository as a zip archive and extract it.
    if not os.path.exists(repo_dir):
        print("Downloading brand and operator name reference...")
        download_link = 'https://github.com/osmlab/name-suggestion-index/archive/refs/heads/main.zip'
        archive_path = os.path.join(temp_files_dir, 'main.zip')

        # urlretrieve does not create missing parent directories.
        os.makedirs(temp_files_dir, exist_ok=True)
        req.urlretrieve(download_link, archive_path)

        try:
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(temp_files_dir)
        finally:
            # The archive is only needed for extraction; never leave it behind.
            os.remove(archive_path)

    category_dirs = [
        os.path.join(repo_dir, 'data', 'brands'),
        os.path.join(repo_dir, 'data', 'operators'),
    ]
    records = []

    print("Composing brand and operator name list...")

    for category_dir in category_dirs:
        for folder in os.listdir(category_dir):
            folder_path = os.path.join(category_dir, folder)

            if not os.path.isdir(folder_path):
                continue

            for file_name in os.listdir(folder_path):
                # Ignore anything that is not a JSON data file.
                if not file_name.endswith('.json'):
                    continue

                # Names contain non-ASCII characters, so do not rely on the
                # platform default encoding.
                with open(os.path.join(folder_path, file_name), encoding='utf-8') as f:
                    file_content = json.load(f)

                for item in file_content['items']:
                    tags = item.get('tags', {})

                    records.append({
                        'id': item.get('id'),
                        'display_name': item.get('displayName'),
                        # The brand reference wins over the operator reference.
                        'wiki_data': tags.get('brand:wikidata', tags.get('operator:wikidata')),
                    })

    df = pd.DataFrame(records, columns=['id', 'display_name', 'wiki_data'])
    # drop_duplicates returns a new frame; the result has to be kept.
    df = df.drop_duplicates(subset=['display_name', 'wiki_data'])
    df.to_csv(os.path.join(temp_files_dir, 'names.csv'), index=False)
    print("Done!")
36 changes: 35 additions & 1 deletion kuwala/pipelines/osm-poi/src/Processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
from pyspark.sql.types import \
ArrayType, BooleanType, FloatType, IntegerType, NullType, StringType, StructField, StructType
from python_utils.src.FileSelector import select_local_osm_file
from python_utils.src.spark_udfs import create_geo_json_based_on_coordinates, get_centroid_of_geo_json, get_h3_index
from python_utils.src.spark_udfs import create_geo_json_based_on_coordinates, get_centroid_of_geo_json, get_h3_index, get_string_distance
import pandas as pd

DEFAULT_RESOLUTION = 15

Expand Down Expand Up @@ -276,6 +277,37 @@ def combine_pois(df_node, df_way, df_relation) -> DataFrame:
df_relation = df_relation.filter(col('is_poi') & col('h3_index').isNotNull()).select(columns)

return df_node.union(df_way).union(df_relation)

# Match brand, operator and POI names against the reference list in names.csv and add the results as columns to the PySpark data frame

@staticmethod
def name_matching(script_dir, spark, df_pois) -> DataFrame:
    """Add the best-matching reference name for the brand, operator and name columns.

    The ``display_name`` column of ``names.csv`` is loaded once, broadcast to
    the executors and compared against each value with
    ``get_string_distance``. The reference name with the highest score is
    stored in ``brand_matched``, ``operator_matched`` and ``name_matched``
    respectively; empty inputs yield a null match.

    :param script_dir: Directory the relative path to ``names.csv`` is resolved against
    :param spark: Active Spark session used to broadcast the reference names
    :param df_pois: Data frame with the columns ``brand``, ``operator`` and ``name``
    :return: ``df_pois`` extended by the three ``*_matched`` columns
    """
    # NOTE(review): confirm that this path points to the names.csv written by
    # the download step — the two locations are built differently.
    reference_names = pd.read_csv(os.path.join(script_dir, '../tmp/names.csv'))['display_name'] \
        .dropna() \
        .astype(str) \
        .tolist()
    names = spark.sparkContext.broadcast(reference_names)

    @udf(returnType=StringType())
    def brand_and_operator_name_matching(value):
        # Spark passes nulls as None; 'nan' covers values that went through pandas.
        if value is None or str(value) == 'nan':
            return None

        similar_name_score = -1
        best_match = None

        # The broadcast handle itself is not iterable; the list lives in .value.
        for name in names.value:
            # NOTE(review): assumes get_string_distance returns a score where
            # higher means more similar — confirm against its definition.
            distance = get_string_distance(value, name)

            if distance > similar_name_score:
                similar_name_score = distance
                best_match = name

        return best_match

    return df_pois \
        .withColumn('brand_matched', brand_and_operator_name_matching(col('brand'))) \
        .withColumn('operator_matched', brand_and_operator_name_matching(col('operator'))) \
        .withColumn('name_matched', brand_and_operator_name_matching(col('name')))



@staticmethod
def start(args):
Expand Down Expand Up @@ -337,6 +369,8 @@ def has_polygon_shape(members):
# Combine all data frames
df_pois = Processor.combine_pois(df_node, df_way, df_relation)

df_pois = Processor.name_matching(script_dir, spark, df_pois)

df_pois.write.mode('overwrite').parquet(file_path + '/parquet/kuwala.parquet')

end_time = time.time()
Expand Down
3 changes: 2 additions & 1 deletion kuwala/pipelines/osm-poi/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
action = 'download' if option == choices[0] else 'process'

if action == 'download':
Downloader.start(args)
Downloader.download_pbf(args)
Downloader.download_names()
else:
Processor.start(args)
14 changes: 11 additions & 3 deletions kuwala/scripts/initialize_windows.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
mkdir windows
sed 's/\r$//' build_all_containers.sh > ./windows/build_all_containers.sh
sed 's/\r$//' initialize_all_components.sh > ./windows/initialize_all_components.sh
sed 's/\r$//' build_cli.sh > ./windows/build_cli.sh
sed -i '1 a cd ..' ./windows/build_cli.sh
sed 's/\r$//' build_jupyter_notebook.sh > ./windows/build_jupyter_notebook.sh
sed 's/\r$//' build_postgres.sh > ./windows/build_postgres.sh
sed -i '1 a cd ..' ./windows/build_jupyter_notebook.sh
sed 's/\r$//' create_zip_archive.sh > ./windows/create_zip_archive.sh
sed 's/\r$//' initialize_all_components.sh > ./windows/initialize_all_components.sh
sed -i '1 a cd ..' ./windows/create_zip_archive.sh
sed 's/\r$//' initialize_core_components.sh > ./windows/initialize_core_components.sh
sed 's/\r$//' initialize_git_submodules.sh > ./windows/initialize_git_submodules.sh
sed -i '1 a cd ..' ./windows/initialize_git_submodules.sh
sed 's/\r$//' run_cli.sh > ./windows/run_cli.sh
sed -i '1 a cd ..' ./windows/run_cli.sh
sed 's/\r$//' run_jupyter_notebook.sh > ./windows/run_jupyter_notebook.sh
sed -i '1 a cd ..' ./windows/run_jupyter_notebook.sh
sed 's/\r$//' stop_all_containers.sh > ./windows/stop_all_containers.sh
sed 's/\r$//' build_all_containers.sh > ./windows/build_all_containers.sh
sed -i '1 a cd ..' ./windows/build_all_containers.sh
sed 's/\r$//' build_postgres.sh > ./windows/build_postgres.sh
sed -i '1 a cd ..' ./windows/build_postgres.sh
IritaSee marked this conversation as resolved.
Show resolved Hide resolved