Source code for pudl.metadata.warnings
"""Standard usage warnings to reference in dynamic table descriptions.

For now, these texts are duplicated in ``docs/data_dictionaries/usage_warnings.rst``,
so if you update a warning here, be sure to make the same update there as well!
"""
[docs]
# Lookup table of canned usage-warning texts, keyed by a short identifier.
# Several of the longer texts embed reStructuredText hyperlinks. Multi-line
# values rely on implicit concatenation of adjacent string literals, so any
# separating space must live inside one of the fragments.
USAGE_WARNINGS = {
    "multiple_inputs": "Contains information from multiple raw inputs.",
    "derived_values": (
        "Contains columns derived from inputs "
        "and not originally present in sources."
    ),
    "imputed_values": "Contains rows where missing values were imputed.",
    # TODO: what do we mean here
    "estimated_values": "Contains estimated values.",
    # TODO: do we want to set a coverage threshold and only apply this when
    # we don't meet it?
    "incomplete_id_coverage": "Not all IDs are present.",
    # TODO: do we mean high rates of missingness? do we want to set a
    # threshold? (The text is still a placeholder.)
    "incomplete_value_coverage": "?",
    "low_coverage": (
        "Table has known low coverage - "
        "either geographic or temporal or otherwise."
    ),
    # e.g. 88888
    "redacted_values": "Some values have been redacted.",
    # e.g. 99999
    "mixed_aggregations": (
        "Some entries contain aggregates that do not match the table type."
    ),
    "month_as_date": "Date column arbitrarily uses the first of the month.",
    "no_leap_year": (
        "Date column disregards leap years "
        "to comply with Actual/365 (Fixed) standard."
    ),
    "irregular_years": "Some years use a slightly different data definition.",
    "known_discrepancies": "Contains known calculation discrepancies.",
    "free_text": (
        "Contains columns which may appear categorical, "
        "but are actually free text."
    ),
    "early_release": "May contain early release data.",
    "aggregation_hazard": (
        "Some columns contain subtotals; "
        "use caution when choosing columns to aggregate."
    ),
    # TODO: set a threshold
    "scale_hazard": "Large table; do not attempt to open with Excel.",
    "outliers": "Outliers present.",
    "missing_years": "Some years are missing from the data record.",
    "ferc_is_hard": (
        "FERC data is notoriously difficult to extract cleanly, "
        "and often contains free-form strings, non-labeled total rows "
        "and lack of IDs. "
        "See `Notable Irregularities "
        "<https://docs.catalyst.coop/pudl/en/latest/data_sources/ferc1.html#notable-irregularities>`_ "
        "for details."
    ),
    "discontinued_data": (
        "The original data is no longer being collected "
        "or reported in this way."
    ),
    "discontinued_pudl": "PUDL does not currently update its copy of this data.",
    "experimental_wip": (
        "This table is experimental and/or a work in progress "
        "and may change in the future."
    ),
    "harvested": (
        "Data has been drawn from several EIA sources which are not always "
        "consistent with each other, and PUDL chooses the most consistent or "
        "relevant value to facilitate cross-referencing even if that means "
        "some values will differ from the raw sources. "
        "See `Harvesting "
        "<https://docs.catalyst.coop/pudl/en/latest/data_dictionaries/usage_warnings.html#harvested>`_ "
        "for details, and see `Entity Resolution Methodology "
        "<https://docs.catalyst.coop/pudl/en/latest/methodology/entity_resolution.html>`_ "
        "for a fuller conceptual overview."
    ),
    # TODO: If more attributes harvested, update to refer to static attributes.
    "harvested_rus": (
        "Borrower name data has been drawn from reported values over multiple "
        "years and tables of data which are not always consistent with each "
        "other. PUDL chooses the most consistent borrower name to facilitate "
        "cross-referencing even if that means some values will differ from "
        "the raw sources."
    ),
    "harvesting_ingredients": (
        "This table is meant for forensic purposes only. "
        "It contains all values which were used to choose canonical or "
        "golden-record. "
        "See `Entity Resolution Methodology "
        "<https://docs.catalyst.coop/pudl/en/latest/methodology/entity_resolution.html>`_ "
        "for a fuller conceptual overview."
    ),
}