georgia-tech-db · adarsh2397 · Oct 20, 2022 · Oct 21, 2022 · Oct 21, 2022 · Oct 21, 2022
diff --git a/eva/binder/statement_binder.py b/eva/binder/statement_binder.py
@@ -26,9 +26,9 @@
 from eva.catalog.catalog_manager import CatalogManager
 from eva.configuration.configuration_manager import ConfigurationManager
 from eva.expression.abstract_expression import AbstractExpression
+from eva.expression.expression_utils import extract_alias_from_function_expression
 from eva.expression.function_expression import FunctionExpression
 from eva.expression.tuple_value_expression import TupleValueExpression
-from eva.parser.alias import Alias
 from eva.parser.create_mat_view_statement import CreateMaterializedViewStatement
 from eva.parser.drop_statement import DropTableStatement
 from eva.parser.load_statement import LoadDataStatement
@@ -227,26 +227,44 @@ def _bind_func_expr(self, node: FunctionExpression):
         for child in node.children:
             self.bind(child)
 
+        # First, check if it refers to any specific model in the catalog
         udf_obj = self._catalog.get_udf_by_name(node.name)
         if udf_obj is None:
-            err_msg = (
-                f"UDF with name {node.name} does not exist in the catalog. "
-                "Please create the UDF using CREATE UDF command."
-            )
-            logger.error(err_msg)
-            raise BinderError(err_msg)
-
-        try:
-            node.function = path_to_class(udf_obj.impl_file_path, udf_obj.name)()
-        except Exception as e:
-            err_msg = (
-                f"{str(e)}. Please verify that the UDF class name in the"
-                "implementation file matches the UDF name."
-            )
-            logger.error(err_msg)
-            raise BinderError(err_msg)
+            # If not, we would want to check if it refers to a type of a model instead
+            if self._catalog.get_udf_by_type(node.name) is None:
+                err_msg = (
+                    f"UDF with name {node.name} does not exist in the catalog. "
+                    "Please create the UDF using CREATE UDF command."
+                )
+                logger.error(err_msg)
+                raise BinderError(err_msg)
+            else:
+                # matches a UDF type only: record the type, leave function unset
+                node.function = None
+                node.function_type = node.name
+        else:
+            try:
+                node.function = path_to_class(udf_obj.impl_file_path, udf_obj.name)()
+            except Exception as e:  # loading or instantiating the UDF class failed
+                err_msg = (
+                    f"{str(e)}. Please verify that the UDF class name in the "
+                    "implementation file matches the UDF name."
+                )
+                logger.error(err_msg)
+                raise BinderError(err_msg) from e
 
-        output_objs = self._catalog.get_udf_outputs(udf_obj)
+        # we need to populate output_objs even with logical UDF type
+        # we'll use the type name to get any udf_output that matches this type
+        output_objs = None
+        if udf_obj is None:
+            # TODO: Change this method later
+            # 1. Get any UDF ID that matches the logical type
+            # 2. Use this ID to get the output_objs
+            # (Assumption: All UDFs with the same logical type will have same outputs)
+            udf_temp_obj = self._catalog.get_udf_by_type(node.name)
+            output_objs = self._catalog.get_udf_outputs(udf_temp_obj)
+        else:
+            output_objs = self._catalog.get_udf_outputs(udf_obj)
         if node.output:
             for obj in output_objs:
                 if obj.name.lower() == node.output:
@@ -260,23 +278,14 @@ def _bind_func_expr(self, node: FunctionExpression):
             node.output_objs = output_objs
             node.projection_columns = [obj.name.lower() for obj in output_objs]
 
-        default_alias_name = node.name.lower()
-        default_output_col_aliases = [str(obj.name.lower()) for obj in node.output_objs]
-        if not node.alias:
-            node.alias = Alias(default_alias_name, default_output_col_aliases)
-        else:
-            if not len(node.alias.col_names):
-                node.alias = Alias(node.alias.alias_name, default_output_col_aliases)
-            else:
-                output_aliases = [
-                    str(col_name.lower()) for col_name in node.alias.col_names
-                ]
-                node.alias = Alias(node.alias.alias_name, output_aliases)
+        # resolve Alias only if the UDF has been resolved
+        if node.function is not None:
+            node.alias = extract_alias_from_function_expression(node)
 
-        if len(node.alias.col_names) != len(node.output_objs):
-            err_msg = (
-                f"Expected {len(node.output_objs)} output columns for "
-                f"{node.alias.alias_name}, got {len(node.alias.col_names)}."
-            )
-            logger.error(err_msg)
-            raise BinderError(err_msg)
+            if len(node.alias.col_names) != len(node.output_objs):
+                err_msg = (
+                    f"Expected {len(node.output_objs)} output columns for "
+                    f"{node.alias.alias_name}, got {len(node.alias.col_names)}."
+                )
+                logger.error(err_msg)
+                raise BinderError(err_msg)
diff --git a/eva/catalog/catalog_manager.py b/eva/catalog/catalog_manager.py
@@ -240,6 +240,18 @@ def get_udf_by_name(self, name: str) -> UdfMetadata:
         """
         return self._udf_service.udf_by_name(name)
 
+    def get_udf_by_type(self, type: str) -> UdfMetadata:
+        """
+        Get the first UDF registered in the catalog with the given type.
+
+        Arguments:
+            type (str): type of the UDF, e.g. "ObjectDetection"
+
+        Returns:
+            UdfMetadata object, or None if no UDF of this type exists
+        """
+        return self._udf_service.udf_by_type(type)
+
     def get_udf_inputs(self, udf_obj: UdfMetadata) -> List[UdfIO]:
         if not isinstance(udf_obj, UdfMetadata):
             raise ValueError(

diff --git a/eva/catalog/services/udf_service.py b/eva/catalog/services/udf_service.py
@@ -51,6 +51,16 @@ def udf_by_name(self, name: str):
         except NoResultFound:
             return None
 
+    def udf_by_type(self, name: str):
+        """return the first udf entry that matches the type provided
+           (the query has no explicit ordering). None if no such entry found.
+
+        Arguments:
+            name (str): udf type to be searched (compared against `_type`)
+        """
+
+        return self.model.query.filter(self.model._type == name).first()
+
     def udf_by_id(self, id: int):
         """return the udf entry that matches the id provided.
            None if no such entry found.

diff --git a/eva/executor/create_udf_executor.py b/eva/executor/create_udf_executor.py
@@ -46,6 +46,13 @@ def exec(self):
                 msg = f"UDF {self.node.name} already exists."
                 logger.error(msg)
                 raise RuntimeError(msg)
+
+        # TODO: enable this once all existing UDFs have a common type, input and output standard
+        # check catalog if a UDF of the same type already exists
+        if catalog_manager.get_udf_by_type(self.node.udf_type):
+            # check if the inputs and outputs of this type match with existing UDF
+            pass
+
         io_list = []
         io_list.extend(self.node.inputs)
         io_list.extend(self.node.outputs)

diff --git a/eva/expression/expression_utils.py b/eva/expression/expression_utils.py
@@ -18,8 +18,10 @@
 from eva.expression.abstract_expression import AbstractExpression, ExpressionType
 from eva.expression.comparison_expression import ComparisonExpression
 from eva.expression.constant_value_expression import ConstantValueExpression
+from eva.expression.function_expression import FunctionExpression
 from eva.expression.logical_expression import LogicalExpression
 from eva.expression.tuple_value_expression import TupleValueExpression
+from eva.parser.alias import Alias
 
 
 def expression_tree_to_conjunction_list(expression_tree):
@@ -275,3 +277,43 @@ def _has_simple_expressions(expr):
     ]
 
     return _has_simple_expressions(predicate) and contains_single_column(predicate)
+
+
+def extract_alias_from_function_expression(expr: FunctionExpression) -> Alias:
+    """Build the Alias a bound Function Expression should carry.
+
+    The alias name defaults to the lower-cased function name and the column
+    names default to the lower-cased names of ``expr.output_objs``; whatever
+    the user supplied in ``expr.alias`` takes precedence over those defaults.
+
+    Args:
+        expr (FunctionExpression): expression whose ``output_objs`` are set
+
+    Returns:
+        Alias: a new Alias object; ``expr`` itself is not modified
+    """
+    fallback_name = expr.name.lower()
+    fallback_cols = [str(out.name.lower()) for out in expr.output_objs]
+
+    user_alias = expr.alias
+    if not user_alias:
+        return Alias(fallback_name, fallback_cols)
+    if len(user_alias.col_names) == 0:
+        # alias name given without column names: keep the output names
+        return Alias(user_alias.alias_name, fallback_cols)
+
+    user_cols = [str(col.lower()) for col in user_alias.col_names]
+    return Alias(user_alias.alias_name, user_cols)
+
+
+def is_function_expression(expr: AbstractExpression) -> bool:
+    """Checks if expr.etype is ExpressionType.FUNCTION_EXPRESSION.
+
+    Args:
+        expr (AbstractExpression): expression to check; only its etype is read
+
+    Returns:
+        bool: True if the etype marks a function expression, else False
+    """
+
+    return expr.etype == ExpressionType.FUNCTION_EXPRESSION
diff --git a/eva/expression/function_expression.py b/eva/expression/function_expression.py
@@ -49,12 +49,14 @@ def __init__(
         name: str,
         output: str = None,
         alias: Alias = None,
+        func_type: str = None,
         **kwargs
     ):
 
         super().__init__(ExpressionType.FUNCTION_EXPRESSION, **kwargs)
         self._context = Context()
         self._name = name
+        self._function_type = func_type
         self._function = func
         self._output = output
         self.alias = alias
@@ -73,10 +75,22 @@ def output(self):
     def function(self):
         return self._function
 
+    @property
+    def function_type(self):  # logical UDF type name; None unless one was set
+        return self._function_type
+
+    @name.setter
+    def name(self, func_name: str):  # set by optimizer rules to the chosen UDF's name
+        self._name = func_name
+
     @function.setter
     def function(self, func: Callable):
         self._function = func
 
+    @function_type.setter
+    def function_type(self, func_type: str):  # set by the binder for type-only calls
+        self._function_type = func_type
+
     def evaluate(self, batch: Batch, **kwargs) -> Batch:
         new_batch = batch
         child_batches = [child.evaluate(batch, **kwargs) for child in self.children]

diff --git a/eva/optimizer/operators.py b/eva/optimizer/operators.py
@@ -565,7 +565,7 @@ class LogicalCreateUDF(Operator):
             This file should be placed in the UDF directory and
             the path provided should be relative to the UDF dir.
         udf_type: str
-            udf type. it ca be object detection, classification etc.
+            udf type. it can be object detection, classification etc.
     """
 
     def __init__(
@@ -863,6 +863,10 @@ def func_expr(self):
     def do_unnest(self):
         return self._do_unnest
 
+    @func_expr.setter
+    def func_expr(self, expr):  # used by LogicalFunctionScanToPhysical.apply
+        self._func_expr = expr
+
     def __eq__(self, other):
         is_subtree_equal = super().__eq__(other)
         if not isinstance(other, LogicalFunctionScan):

diff --git a/eva/optimizer/rules/rules.py b/eva/optimizer/rules/rules.py
@@ -18,7 +18,13 @@
 from enum import Flag, IntEnum, auto
 from typing import TYPE_CHECKING
 
-from eva.expression.expression_utils import conjuction_list_to_expression_tree
+from eva.catalog.catalog_manager import CatalogManager
+from eva.expression.expression_utils import (
+    conjuction_list_to_expression_tree,
+    extract_alias_from_function_expression,
+    is_function_expression,
+)
+from eva.expression.function_expression import FunctionExpression
 from eva.optimizer.optimizer_utils import (
     extract_equi_join_keys,
     extract_pushdown_predicate,
@@ -31,6 +37,7 @@
 from eva.planner.predicate_plan import PredicatePlan
 from eva.planner.project_plan import ProjectPlan
 from eva.planner.show_info_plan import ShowInfoPlan
+from eva.utils.generic_utils import path_to_class
 
 if TYPE_CHECKING:
     from eva.optimizer.optimizer_context import OptimizerContext
@@ -673,6 +680,7 @@ def apply(self, before: LogicalUpload, context: OptimizerContext):
 class LogicalGetToSeqScan(Rule):
     def __init__(self):
         pattern = Pattern(OperatorType.LOGICALGET)
+        self.catalog = CatalogManager()  # apply() looks up UDFs by type through it
         super().__init__(RuleType.LOGICAL_GET_TO_SEQSCAN, pattern)
 
     def promise(self):
@@ -692,6 +700,27 @@ def apply(self, before: LogicalGet, context: OptimizerContext):
         )
         if config_batch_mem_size:
             batch_mem_size = config_batch_mem_size
+
+        if before.target_list is not None:
+            for idx, target in enumerate(before.target_list):
+                if is_function_expression(target):
+                    func_expr: FunctionExpression = target
+                    if (
+                        func_expr.function is None
+                        and func_expr.function_type is not None
+                    ):
+                        # TODO: Replace 'get_udf_by_type' with a cost-based selection method
+                        udf_obj = self.catalog.get_udf_by_type(func_expr.function_type)
+                        func_expr.function = path_to_class(
+                            udf_obj.impl_file_path, udf_obj.name
+                        )()
+                        func_expr.name = udf_obj.name
+                        func_expr.alias = extract_alias_from_function_expression(
+                            func_expr
+                        )  # resolve the alias
+
+                        before.target_list[idx] = func_expr
+
         after = SeqScanPlan(None, before.target_list, before.alias)
         after.append_child(
             StoragePlan(
@@ -803,6 +832,7 @@ def apply(self, before: LogicalLimit, context: OptimizerContext):
 class LogicalFunctionScanToPhysical(Rule):
     def __init__(self):
         pattern = Pattern(OperatorType.LOGICALFUNCTIONSCAN)
+        self.catalog = CatalogManager()  # apply() looks up UDFs by type through it
         super().__init__(RuleType.LOGICAL_FUNCTION_SCAN_TO_PHYSICAL, pattern)
 
     def promise(self):
@@ -812,6 +842,17 @@ def check(self, before: Operator, context: OptimizerContext):
         return True
 
     def apply(self, before: LogicalFunctionScan, context: OptimizerContext):
+        # Bind a concrete UDF when the binder recorded only a logical UDF type.
+        expr: FunctionExpression = before.func_expr
+        if expr.function is None and expr.function_type is not None:
+            # TODO: Replace 'get_udf_by_type' with a cost-based selection method
+            udf = self.catalog.get_udf_by_type(expr.function_type)
+            expr.function = path_to_class(udf.impl_file_path, udf.name)()
+            expr.name = udf.name
+            # the default alias name comes from expr.name, so resolve it last
+            expr.alias = extract_alias_from_function_expression(expr)
+            before.func_expr = expr
+
         after = FunctionScanPlan(before.func_expr, before.do_unnest)
         return after
 

diff --git a/eva/udfs/udf_bootstrap_queries.py b/eva/udfs/udf_bootstrap_queries.py
@@ -71,7 +71,7 @@
       INPUT  (Frame_Array NDARRAY UINT8(3, ANYDIM, ANYDIM))
       OUTPUT (labels NDARRAY STR(ANYDIM), bboxes NDARRAY FLOAT32(ANYDIM, 4),
                 scores NDARRAY FLOAT32(ANYDIM))
-      TYPE  Classification
+      TYPE  ObjectDetection
       IMPL  '{}/udfs/fastrcnn_object_detector.py';
       """.format(
     EVA_INSTALLATION_DIR

diff --git a/test/catalog/test_catalog_manager.py b/test/catalog/test_catalog_manager.py
@@ -168,6 +168,13 @@ def test_get_udf_by_name(self, udf_mock):
         udf_mock.return_value.udf_by_name.assert_called_with("name")
         self.assertEqual(actual, udf_mock.return_value.udf_by_name.return_value)
 
+    @mock.patch("eva.catalog.catalog_manager.UdfService")
+    def test_get_udf_by_type(self, udf_mock):
+        service = udf_mock.return_value
+        result = CatalogManager().get_udf_by_type("type")
+        service.udf_by_type.assert_called_with("type")
+        self.assertEqual(result, service.udf_by_type.return_value)
+
     @mock.patch("eva.catalog.catalog_manager.UdfService")
     def test_drop_udf(self, udf_mock):
         CatalogManager().drop_udf("name")

diff --git a/test/integration_tests/test_select_executor.py b/test/integration_tests/test_select_executor.py
@@ -278,6 +278,14 @@ def test_lateral_join(self):
         self.assertEqual(list(actual_batch.columns), ["myvideo.id", "T.a"])
         self.assertEqual(len(actual_batch), 5)
 
+    @pytest.mark.torchtest
+    def test_lateral_join_with_logical_udf(self):  # ObjectDetection is a UDF type
+        select_query = """SELECT id, a FROM MyVideo JOIN LATERAL
+                        ObjectDetection(data) AS T(a,b,c) WHERE id < 5;"""
+        actual_batch = execute_query_fetch_all(select_query)
+        self.assertEqual(list(actual_batch.columns), ["myvideo.id", "T.a"])
+        self.assertEqual(len(actual_batch), 5)
+
     @pytest.mark.torchtest
     def test_lateral_join_with_multiple_projects(self):
         select_query = """SELECT id, T.labels FROM MyVideo JOIN LATERAL