Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 58 additions & 3 deletions pygbif/occurrences/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,47 @@
)


# Keys that should have integer values in the GBIF API
# Taxonomic keys from GBIF's backbone taxonomy that should be integers
# Note: Other keys like DATASET_KEY, NETWORK_KEY are UUIDs and should remain strings
taxonomic_integer_keys = {
"TAXON_KEY", "SPECIES_KEY", "GENUS_KEY", "FAMILY_KEY",
"ORDER_KEY", "CLASS_KEY", "PHYLUM_KEY", "KINGDOM_KEY",
Comment on lines +26 to +30
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comments/variable name imply these are strictly taxonomic backbone keys, but the set also includes YEAR and MONTH. To avoid confusion for future maintenance, either rename this set to something like integer_predicate_keys (or similar) or adjust the comment to match its actual contents.

Suggested change
# Taxonomic keys from GBIF's backbone taxonomy that should be integers
# Note: Other keys like DATASET_KEY, NETWORK_KEY are UUIDs and should remain strings
taxonomic_integer_keys = {
"TAXON_KEY", "SPECIES_KEY", "GENUS_KEY", "FAMILY_KEY",
"ORDER_KEY", "CLASS_KEY", "PHYLUM_KEY", "KINGDOM_KEY",
# Includes taxonomic backbone keys (e.g. TAXON_KEY, SPECIES_KEY) and other
# integer-valued predicates such as YEAR and MONTH. Other keys like
# DATASET_KEY, NETWORK_KEY are UUIDs and should remain strings.
taxonomic_integer_keys = {
"TAXON_KEY", "SPECIES_KEY", "GENUS_KEY", "FAMILY_KEY",
"ORDER_KEY", "CLASS_KEY", "PHYLUM_KEY", "KINGDOM_KEY",

Copilot uses AI. Check for mistakes.
"SUBGENUS_KEY", "YEAR", "MONTH"
}

def _convert_to_int(value, key_name):
"""
Convert a value to integer, handling float-like strings (e.g., "5785887.0").

:param value: The value to convert
:param key_name: The name of the key (for logging purposes)
:return: Integer value if conversion successful, otherwise raises ValueError
"""
if not isinstance(value, str):
return int(value)

try:
# First try direct integer conversion
return int(value)
except ValueError:
# If that fails, try converting via float (handles "5785887.0")
try:
float_val = float(value)
int_val = int(float_val)
# Check if it's actually an integer value
if float_val != int_val:
logging.warning(
f"Converting non-integer value {value} to {int_val} for {key_name}. "
f"Fractional part {float_val - int_val} will be lost."
)
return int_val
except ValueError:
raise ValueError(
f"Cannot convert value '{value}' to integer for {key_name}. "
f"Expected a numeric value."
)

# how to parse arguments/predicates
def _parse_args(x):
x = x.replace("'", '"')
Expand All @@ -45,13 +86,27 @@ def _parse_args(x):
"error: in predicate has to be associated with a list in square brackets (for example [1, 2, 3])"
)
else:
return {"type": "in", "key": key, "values": json.loads(value_list.group(0))}
values = json.loads(value_list.group(0))
# Convert taxonomic key values to integers
if key in taxonomic_integer_keys:
converted_values = []
for v in values:
try:
converted_values.append(_convert_to_int(v, key))
except ValueError as e:
raise ValueError(f"Error in 'in' predicate for {key}: {str(e)}")
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When re-raising conversion failures in the 'in' predicate branch, the original exception context is dropped. Using exception chaining (raise ... from e) would preserve the underlying cause/traceback and make debugging malformed lists easier.

Suggested change
raise ValueError(f"Error in 'in' predicate for {key}: {str(e)}")
raise ValueError(f"Error in 'in' predicate for {key}: {str(e)}") from e

Copilot uses AI. Check for mistakes.
values = converted_values
return {"type": "in", "key": key, "values": values}
Comment on lines +89 to +99
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The integer coercion for in predicates will raise ValueError for common “float-like” inputs such as "5785887.0" (the reported issue) because str.isdigit() is false and the else branch still does int(v) without any guarding. Consider using a single safe coercion routine that accepts int, float values with .is_integer(), and strings matching ^\d+(\.0+)?$ (or Decimal) and otherwise leaves the value unchanged or raises a clearer error.

Copilot uses AI. Check for mistakes.
pred_type = operator_lkup.get(tmp[1])
value = tmp[2]
# Convert taxonomic key values to integers
if key in taxonomic_integer_keys:
value = _convert_to_int(value, key)
return {
"type": pred_type,
"key": key,
"value": tmp[2],
} # does not work for in, within, geodistance, not, like, isnull and isnotnull predicate values
"value": value,
}


def _check_environ(variable, value):
Expand Down
167 changes: 167 additions & 0 deletions test/test-occurrences-download_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,3 +196,170 @@ def test_geometry_predicate(self):
payload["predicate"]["predicates"][0],
{"type": "within", "geometry": "POLYGON((-82.7 36.9, -85.0 35.6, -81.0 33.5, -79.4 36.3, -79.4 36.3, -82.7 36.9))"},
)

def test_taxonkey_integer_conversion(self):
"""Test that taxonKey values are converted to integers (issue #179)"""
dl_key, payload = download(
"taxonKey = 3119195",
user="dummy",
email="dummy",
pwd="dummy",
)
# taxonKey value should be an integer, not a string
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{"key": "TAXON_KEY", "type": "equals", "value": 3119195},
)
Comment on lines +200 to +212
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The added tests validate integer conversion for clean integer strings, but they don’t cover the reported failing case where taxonKey comes through as a float-like string (e.g., "5785887.0"). Adding an assertion for that exact input (both = and in [...]) would prevent regressions and ensure issue #179 is actually fixed.

Copilot uses AI. Check for mistakes.
self.assertIsInstance(payload["predicate"]["predicates"][0]["value"], int)

def test_taxonkey_in_predicate_integer_conversion(self):
"""Test that taxonKey values in 'in' predicate are converted to integers"""
dl_key, payload = download(
'taxonKey in ["2387246", "2399391", "2364604"]',
user="dummy",
email="dummy",
pwd="dummy",
)
# taxonKey values should be integers in the list
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{"key": "TAXON_KEY", "type": "in", "values": [2387246, 2399391, 2364604]},
)
for val in payload["predicate"]["predicates"][0]["values"]:
self.assertIsInstance(val, int)

def test_year_integer_conversion(self):
"""Test that year values are converted to integers"""
dl_key, payload = download(
"year = 2023",
user="dummy",
email="dummy",
pwd="dummy",
)
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{"key": "YEAR", "type": "equals", "value": 2023},
)
self.assertIsInstance(payload["predicate"]["predicates"][0]["value"], int)

def test_depth_numeric_conversion(self):
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test name "test_depth_numeric_conversion" conflicts with its docstring/behavior (it asserts depth stays a string and is not converted). Consider renaming the test to reflect the intent (e.g., that non-integer numeric-like fields remain strings) to keep the suite self-explanatory.

Suggested change
def test_depth_numeric_conversion(self):
def test_depth_numeric_like_value_remains_string(self):

Copilot uses AI. Check for mistakes.
"""Test that depth values are kept as strings (not converted)"""
dl_key, payload = download(
"depth = 80.5",
user="dummy",
email="dummy",
pwd="dummy",
)
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{"key": "DEPTH", "type": "equals", "value": "80.5"},
)
self.assertIsInstance(payload["predicate"]["predicates"][0]["value"], str)

def test_multiple_predicates_with_integer_keys(self):
"""Test multiple predicates with integer key conversion"""
dl_key, payload = download(
["taxonKey = 7264332", "year = 2020"],
user="dummy",
email="dummy",
pwd="dummy",
)
temp_pred = payload["predicate"]["predicates"]
# Both values should be integers
self.assertEqual(temp_pred[0]["value"], 7264332)
self.assertEqual(temp_pred[1]["value"], 2020)
self.assertIsInstance(temp_pred[0]["value"], int)
self.assertIsInstance(temp_pred[1]["value"], int)

def test_datasetkey_remains_string(self):
"""Test that datasetKey values remain as strings (UUIDs, not converted)"""
dl_key, payload = download(
"datasetKey = 50c9509d-22c7-4a22-a47d-8c48425ef4a7",
user="dummy",
email="dummy",
pwd="dummy",
)
# datasetKey should remain as a string (UUID format)
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{"key": "DATASET_KEY", "type": "equals", "value": "50c9509d-22c7-4a22-a47d-8c48425ef4a7"},
)
self.assertIsInstance(payload["predicate"]["predicates"][0]["value"], str)

def test_datasetkey_in_predicate_remains_string(self):
"""Test that datasetKey values in 'in' predicate remain as strings"""
dl_key, payload = download(
'datasetKey in ["50c9509d-22c7-4a22-a47d-8c48425ef4a7", "7b5d6a48-f762-11e1-a439-00145eb45e9a"]',
user="dummy",
email="dummy",
pwd="dummy",
)
# datasetKey values should remain as strings (UUIDs)
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{
"key": "DATASET_KEY",
"type": "in",
"values": ["50c9509d-22c7-4a22-a47d-8c48425ef4a7", "7b5d6a48-f762-11e1-a439-00145eb45e9a"]
},
)
for val in payload["predicate"]["predicates"][0]["values"]:
self.assertIsInstance(val, str)

def test_mixed_taxonomic_and_uuid_keys(self):
"""Test that taxonomic keys are converted to int but UUID keys remain strings"""
dl_key, payload = download(
["taxonKey = 3119195", "datasetKey = 50c9509d-22c7-4a22-a47d-8c48425ef4a7"],
user="dummy",
email="dummy",
pwd="dummy",
)
temp_pred = payload["predicate"]["predicates"]
# taxonKey should be integer
self.assertEqual(temp_pred[0]["value"], 3119195)
self.assertIsInstance(temp_pred[0]["value"], int)
# datasetKey should be string (UUID)
self.assertEqual(temp_pred[1]["value"], "50c9509d-22c7-4a22-a47d-8c48425ef4a7")
self.assertIsInstance(temp_pred[1]["value"], str)

def test_taxonkey_float_string_conversion(self):
"""Test that float-like strings (e.g., '5785887.0') are converted to integers"""
dl_key, payload = download(
"taxonKey = 5785887.0",
user="dummy",
email="dummy",
pwd="dummy",
)
# Should convert "5785887.0" to integer 5785887
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{"key": "TAXON_KEY", "type": "equals", "value": 5785887},
)
self.assertIsInstance(payload["predicate"]["predicates"][0]["value"], int)

def test_taxonkey_in_predicate_float_string_conversion(self):
"""Test that float-like strings in 'in' predicates are converted to integers"""
dl_key, payload = download(
'taxonKey in ["2387246.0", "2399391.0", "2364604"]',
user="dummy",
email="dummy",
pwd="dummy",
)
# All values should be converted to integers
self.assertDictEqual(
payload["predicate"]["predicates"][0],
{"key": "TAXON_KEY", "type": "in", "values": [2387246, 2399391, 2364604]},
)
for val in payload["predicate"]["predicates"][0]["values"]:
self.assertIsInstance(val, int)

def test_taxonkey_invalid_value_raises_error(self):
"""Test that non-numeric values for taxonomic keys raise an error"""
with self.assertRaises(ValueError) as context:
download(
"taxonKey = notanumber",
user="dummy",
email="dummy",
pwd="dummy",
)
self.assertIn("Cannot convert", str(context.exception))
Loading