Skip to content

Commit

Permalink
fix(ingest/json-schema): adding support descriptions for array (datah…
Browse files Browse the repository at this point in the history
  • Loading branch information
AvaniSiddhapuraAPT authored and Alexander Sukhoborov committed Mar 5, 2024
1 parent bed50d6 commit 684e1cc
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -417,15 +417,35 @@ def _field_from_complex_type(
inner_field_path,
)
elif datahub_field_type == ArrayTypeClass:
field_path = field_path.expand_type("array", schema)
# default items schema is string
field_path = field_path.expand_type(discriminated_type, schema)
yield SchemaField(
fieldPath=field_path.as_string(),
type=type_override or SchemaFieldDataTypeClass(type=ArrayTypeClass()),
nativeDataType=native_type_override
or JsonSchemaTranslator._get_discriminated_type_from_schema(schema),
description=JsonSchemaTranslator._get_description_from_any_schema(
schema
),
nullable=nullable,
jsonProps=JsonSchemaTranslator._get_jsonprops_for_any_schema(
schema, required=required
),
isPartOfKey=field_path.is_key_schema,
)

items_schema = schema.get("items", {"type": "string"})
items_type = JsonSchemaTranslator._get_type_from_schema(items_schema)
field_path._set_parent_type_if_not_exists(
DataHubType(type=ArrayTypeClass, nested_type=items_type)
field_name = items_schema.get("title", None)
if not field_name:
field_name = items_type
inner_field_path = field_path.clone_plus(
FieldElement(type=[], name=field_name, schema_types=[])
)
yield from JsonSchemaTranslator.get_fields(
items_type, items_schema, required=False, base_field_path=field_path
items_type,
items_schema,
required=False,
base_field_path=inner_field_path,
)

elif datahub_field_type == MapTypeClass:
Expand Down
57 changes: 49 additions & 8 deletions metadata-ingestion/tests/unit/schema/test_json_schema_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,20 @@ def test_json_schema_with_recursion():
},
}
fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))

expected_field_paths = [
{
"path": "[version=2.0].[type=TreeNode].[type=integer].value",
"type": NumberTypeClass,
},
{
"path": "[version=2.0].[type=TreeNode].[type=array].[type=TreeNode].children",
"path": "[version=2.0].[type=TreeNode].[type=array].children",
"type": ArrayTypeClass,
},
{
"path": "[version=2.0].[type=TreeNode].[type=array].children.[type=TreeNode].TreeNode",
"type": RecordTypeClass,
},
]
assert_field_paths_match(fields, expected_field_paths)
assert_fields_are_valid(fields)
Expand Down Expand Up @@ -372,8 +377,10 @@ def test_nested_arrays():

fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))
expected_field_paths: List[str] = [
"[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar",
"[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar.[type=integer].a",
"[version=2.0].[type=NestedArray].[type=array].ar",
"[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array",
"[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo",
"[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo.[type=integer].a",
]
assert_field_paths_match(fields, expected_field_paths)
assert isinstance(fields[0].type.type, ArrayTypeClass)
Expand Down Expand Up @@ -496,14 +503,17 @@ def test_needs_disambiguation_nested_union_of_records_with_same_field_name():
},
}
fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))

expected_field_paths: List[str] = [
"[version=2.0].[type=ABFooUnion].[type=union].a",
"[version=2.0].[type=ABFooUnion].[type=union].[type=A].a",
"[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
"[version=2.0].[type=ABFooUnion].[type=union].[type=B].a",
"[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
"[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a",
"[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=integer].f",
"[version=2.0].[type=ABFooUnion].[type=union].[type=array].a",
"[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array",
"[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo",
"[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=integer].f",
]
assert_field_paths_match(fields, expected_field_paths)

Expand Down Expand Up @@ -578,8 +588,10 @@ def test_key_schema_handling():
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=number].f",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo",
"[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=number].f",
]
assert_field_paths_match(fields, expected_field_paths)
for f in fields:
Expand Down Expand Up @@ -664,7 +676,8 @@ def test_simple_array():

fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))
expected_field_paths: List[str] = [
"[version=2.0].[type=ObjectWithArray].[type=array].[type=string].ar",
"[version=2.0].[type=ObjectWithArray].[type=array].ar",
"[version=2.0].[type=ObjectWithArray].[type=array].ar.[type=string].string",
]
assert_field_paths_match(fields, expected_field_paths)
assert isinstance(fields[0].type.type, ArrayTypeClass)
Expand Down Expand Up @@ -846,3 +859,31 @@ def test_top_level_trival_allof():
assert json.loads(fields[1].jsonProps or "{}")["required"] is False
assert json.loads(fields[2].jsonProps or "{}")["required"] is True
assert json.loads(fields[3].jsonProps or "{}")["required"] is False


def test_description_extraction():
    """Check that an array property's ``description`` lands on its array field.

    The schema declares a single array-of-strings property, ``bar``, whose
    description is "XYZ". After translation, the field whose path identifies
    the array itself must carry that description.
    """
    bar_property = {
        "type": "array",
        "items": {"type": "string"},
        "description": "XYZ",
    }
    schema = {
        "$id": "test",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "properties": {"bar": bar_property},
    }
    array_path = "[version=2.0].[type=object].[type=array].bar"
    item_path = "[version=2.0].[type=object].[type=array].bar.[type=string].string"

    translated = list(JsonSchemaTranslator.get_fields_from_schema(schema))

    expected_field_paths: List[str] = [array_path, item_path]
    assert_field_paths_match(translated, expected_field_paths)
    assert_fields_are_valid(translated)

    # Select the array field by its path rather than by position in the list.
    array_field = next(f for f in translated if f.fieldPath == array_path)
    assert array_field.description == "XYZ"

0 comments on commit 684e1cc

Please sign in to comment.