diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f83388b..4644974 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,7 +6,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install PostgreSQL run: | sudo apt-get update -qq diff --git a/README.md b/README.md index 1775824..2ab0dd8 100644 --- a/README.md +++ b/README.md @@ -21,15 +21,12 @@ in the results match the search terms). Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month. -To run one build you need 150GB of disc space (of which 90GB Postgresql database). The scripts process -39 languages and output 4 files. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with SSD +To run one build you need 150GB of disc space (of which 90GB is Postgresql database). The scripts process +39 languages and output one file. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with SSD discs. ``` -334M wikimedia_importance.csv.gz # the primary file -303M wikipedia_importance.sql.gz -216M wikipedia_article.csv.gz - 88M wikipedia_redirect.csv.gz +334M wikimedia_importance.tsv.gz ``` @@ -51,7 +48,7 @@ retries (wikidata API being unreliable) was added. ## Output data -`wikimedia_importance.csv.gz` contains about 17 million rows. Number of lines grew 2% between 2022 and 2023. +`wikimedia_importance.tsv.gz` contains about 17 million rows. Number of lines grew 2% between 2022 and 2023. The file tab delimited, not quoted, is sorted and contains a header row. | Column | Type | @@ -84,7 +81,7 @@ Currently 39 languages, English has by far the largest share. | ... | | | bg (Bulgarian) | 88,993 | -Examples of `wikimedia_importance.csv.gz` rows: +Examples of `wikimedia_importance.tsv.gz` rows: * Wikipedia contains redirects, so a single wikidata object can have multiple titles even though. Each title has the same importance score. 
Redirects to non-existing articles are removed. @@ -311,7 +308,7 @@ uncommon for an export starting Jan/1st to only be full ready Jan/10th or later. 9. output (0:15h) - Uses `pg_dump` tool to create SQL files. Uses SQL `COPY` command to create CSV files. + Uses SQL `COPY` command to create the TSV file. License diff --git a/steps/output.sh b/steps/output.sh index f28a01b..6e4fbe9 100755 --- a/steps/output.sh +++ b/steps/output.sh @@ -88,39 +88,12 @@ echo "WITH from_redirects AS ( -# "=====================================================================" -echo "Create indexes" -# "=====================================================================" - -echo "CREATE INDEX wikipedia_article_title_language_idx - ON wikipedia_article - (title, language) - ;" | psqlcmd -echo "CREATE INDEX wikipedia_article_wd_page_title_idx - ON wikipedia_article - (wd_page_title) - ;" | psqlcmd -echo "CREATE INDEX wikipedia_redirect_language_from_title_idx - ON wikipedia_redirect - (language, from_title) - ;" | psqlcmd - - # "=====================================================================" -echo "Dump tables" +echo "Dump table" # "=====================================================================" -echo "* wikipedia_importance.sql.gz" - -pg_dump -d $DATABASE_NAME --no-owner -t wikipedia_article -t wikipedia_redirect | \ - grep -v '^SET ' | \ - grep -v 'SELECT ' | \ - grep -v '\-\- ' | \ - sed 's/public\.//' | \ - pigz -9 > "$OUTPUT_PATH/wikipedia_importance.sql.gz" - # Temporary table for sorting the output by most popular language. 
Nominatim assigns # the wikipedia extra tag to the first language it finds during import and English (en) @@ -147,34 +120,23 @@ echo "CREATE TABLE top_languages AS -for TABLE in wikipedia_article wikipedia_redirect wikimedia_importance -do - echo "* $TABLE.csv.gz" - - SORTCOL="title" - if [[ "$TABLE" == "wikipedia_redirect" ]]; then - SORTCOL="from_title" - fi +echo "* wikimedia_importance.tsv.gz" - { - echo "COPY (SELECT * FROM $TABLE LIMIT 0) TO STDOUT WITH DELIMITER E'\t' CSV HEADER" | \ - psqlcmd - echo "COPY ( - SELECT w.* - FROM $TABLE w - JOIN top_languages tl ON w.language = tl.language - ORDER BY tl.size DESC, w.$SORTCOL - ) TO STDOUT" | \ - psqlcmd - } | pigz -9 > "$OUTPUT_PATH/$TABLE.csv.gz" +{ + echo "COPY (SELECT * FROM wikimedia_importance LIMIT 0) TO STDOUT WITH DELIMITER E'\t' CSV HEADER" | \ + psqlcmd + echo "COPY ( + SELECT w.* + FROM wikimedia_importance w + JOIN top_languages tl ON w.language = tl.language + ORDER BY tl.size DESC, w.title + ) TO STDOUT" | \ + psqlcmd +} | pigz -9 > "$OUTPUT_PATH/wikimedia_importance.tsv.gz" - # default is 600 - chmod 644 "$OUTPUT_PATH/$TABLE.csv.gz" -done +# default is 600 +chmod 644 "$OUTPUT_PATH/wikimedia_importance.tsv.gz" du -h $OUTPUT_PATH/* -# 220M wikipedia_article.csv.gz -# 87M wikipedia_redirect.csv.gz -# 305M wikipedia_importance.sql.gz -# 265M wikimedia_importance.csv.gz +# 265M wikimedia_importance.tsv.gz