diff --git a/ingest/ingest.smk b/ingest/ingest.smk index a86b89e..ebb6c4c 100644 --- a/ingest/ingest.smk +++ b/ingest/ingest.smk @@ -91,7 +91,7 @@ rule transform_metadata: """ ingest/scripts/tsv-to-ndjson.py < {input.metadata} | ingest/scripts/fix_country_field.py | - ingest/scripts/apply-geolocation-rules.py --geolocation-rules ingest/config/geoLocationRules.tsv | + ingest/vendored/apply-geolocation-rules --geolocation-rules ingest/config/geoLocationRules.tsv | ingest/scripts/add-year.py | ingest/scripts/ndjson-to-tsv.py --metadata-columns {params.metadata_columns} --metadata {output.metadata} """ diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md new file mode 100644 index 0000000..db9ff4c --- /dev/null +++ b/ingest/vendored/README.md @@ -0,0 +1,37 @@ +# ingest + +Shared internal tooling for pathogen data ingest. Used by our individual +pathogen repos which produce Nextstrain builds. Expected to be vendored by +each pathogen repo using `git subtree` (or `git subrepo`). + +Some tools may only live here temporarily before finding a permanent home in +`augur curate` or Nextstrain CLI. Others may happily live out their days here. + +## History + +Much of this tooling originated in +[ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru +[monkeypox's ingest/](https://github.com/nextstrain/monkeypox/tree/@/ingest/). +It subsequently proliferated from [monkeypox][] to other pathogen repos +([rsv][], [zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily +thru copying. To [counter that +proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), +this repo was made. 
+ +[monkeypox]: https://github.com/nextstrain/monkeypox +[rsv]: https://github.com/nextstrain/rsv +[zika]: https://github.com/nextstrain/zika/pull/24 +[dengue]: https://github.com/nextstrain/dengue/pull/10 +[hepatitisB]: https://github.com/nextstrain/hepatitisB +[forecasts-ncov]: https://github.com/nextstrain/forecasts-ncov + +## Elsewhere + +The creation of this repo, in both the abstract and concrete, and the general +approach to "ingest" have been discussed in various internal places, including: + +- https://github.com/nextstrain/private/issues/59 +- @joverlee521's [workflows document](https://docs.google.com/document/d/1rLWPvEuj0Ayc8MR0O1lfRJZfj9av53xU38f20g8nU_E/edit#heading=h.4g0d3mjvb89i) +- [5 July 2023 Slack thread](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079) +- [6 July 2023 team meeting](https://docs.google.com/document/d/1FPfx-ON5RdqL2wyvODhkrCcjgOVX3nlXgBwCPhIEsco/edit) +- _…many others_ diff --git a/ingest/scripts/apply-geolocation-rules.py b/ingest/vendored/apply-geolocation-rules similarity index 93% rename from ingest/scripts/apply-geolocation-rules.py rename to ingest/vendored/apply-geolocation-rules index 1f46d70..2b653be 100755 --- a/ingest/scripts/apply-geolocation-rules.py +++ b/ingest/vendored/apply-geolocation-rules @@ -5,18 +5,6 @@ any additional transformations on top of the user curations. """ -""" -Copied from https://github.com/nextstrain/monkeypox/blob/62fca491c6775573ad036eedf34b271b4952f2c2/ingest/bin/apply-geolocation-rules -with two changes: - -First change allows missing fields in the input ndjson -- annotated_values = transform_geolocations(geolocation_rules, [record.[field] for field in location_fields]) -+ annotated_values = transform_geolocations(geolocation_rules, [record.get(field, '') for field in location_fields]) - -Second change allows blank lines in the location-rules TSV -- if line.lstrip()[0] == '#': -+ if line.strip()=="" or line.lstrip()[0] == '#': -""" import argparse import json