From 8e1b995859915a007871f33763fe329fe47eb111 Mon Sep 17 00:00:00 2001 From: Simon Breuer <86068340+sibre28@users.noreply.github.com> Date: Thu, 27 Jun 2024 09:09:54 +0200 Subject: [PATCH] feat: easily create a baseline model (#811) Closes #710 ### Summary of Changes Added BaselineClassifier, BaselineRegressor and test cases --------- Co-authored-by: megalinter-bot <129584137+megalinter-bot@users.noreply.github.com> --- src/safeds/exceptions/__init__.py | 2 + src/safeds/exceptions/_ml.py | 26 ++- .../ml/classical/classification/__init__.py | 3 + .../classification/_baseline_classifier.py | 188 ++++++++++++++++ .../ml/classical/regression/__init__.py | 3 + .../regression/_baseline_regressor.py | 202 ++++++++++++++++++ .../test_baseline_classifier.py | 85 ++++++++ .../regression/test_baseline_regressor.py | 85 ++++++++ tests/safeds/ml/nn/test_model.py | 8 +- 9 files changed, 595 insertions(+), 7 deletions(-) create mode 100644 src/safeds/ml/classical/classification/_baseline_classifier.py create mode 100644 src/safeds/ml/classical/regression/_baseline_regressor.py create mode 100644 tests/safeds/ml/classical/classification/test_baseline_classifier.py create mode 100644 tests/safeds/ml/classical/regression/test_baseline_regressor.py diff --git a/src/safeds/exceptions/__init__.py b/src/safeds/exceptions/__init__.py index 2f84387c9..dabbc3afa 100644 --- a/src/safeds/exceptions/__init__.py +++ b/src/safeds/exceptions/__init__.py @@ -24,6 +24,7 @@ ModelNotFittedError, PlainTableError, PredictionError, + TargetDataMismatchError, ) @@ -69,6 +70,7 @@ class OutOfBoundsError(SafeDsError): # ML exceptions "DatasetMissesDataError", "DatasetMissesFeaturesError", + "TargetDataMismatchError", "FeatureDataMismatchError", "InvalidFitDataError", "InputSizeError", diff --git a/src/safeds/exceptions/_ml.py b/src/safeds/exceptions/_ml.py index 649ea0455..b7600df34 100644 --- a/src/safeds/exceptions/_ml.py +++ b/src/safeds/exceptions/_ml.py @@ -15,6 +15,26 @@ def __init__(self, 
missing_feature_names: list[str]): super().__init__(f"Dataset misses the feature columns '{missing_feature_names}'.") +class TargetDataMismatchError(ValueError): + """ + Raised when the target column of a test dataset mismatches with the target column of the training dataset. + + Currently only used in the Baseline Models. + + Parameters + ---------- + actual_target_name: + The actual target column of the dataset. + missing_target_name: + The name of the missing target column. + """ + + def __init__(self, actual_target_name: str, missing_target_name: str): + super().__init__( + f"The provided target column '{actual_target_name}' does not match the target column of the training set '{missing_target_name}'.", + ) + + class DatasetMissesDataError(ValueError): """Raised when a dataset contains no rows.""" @@ -72,16 +92,16 @@ def __init__(self, reason: str): class FeatureDataMismatchError(Exception): - """Raised when the columns of the table passed to the predict or fit method do not match with the specified features of the neural network.""" + """Raised when the columns of the table passed to the predict or fit method do not match with the specified features of the model.""" def __init__(self) -> None: super().__init__( - "The features in the given table do not match with the specified feature columns names of the neural network.", + "The features in the given table do not match with the specified feature columns names of the model.", ) class InputSizeError(Exception): - """Raised when the amount of features being passed to a network does not match with its input size.""" + """Raised when the amount of features being passed to a model does not match with its input size.""" def __init__(self, data_size: int | ModelImageSize, input_layer_size: int | ModelImageSize | None) -> None: # TODO: remove input_layer_size type None again diff --git a/src/safeds/ml/classical/classification/__init__.py b/src/safeds/ml/classical/classification/__init__.py index 6ad258333..cff7ecbc2 
100644 --- a/src/safeds/ml/classical/classification/__init__.py +++ b/src/safeds/ml/classical/classification/__init__.py @@ -6,6 +6,7 @@ if TYPE_CHECKING: from ._ada_boost_classifier import AdaBoostClassifier + from ._baseline_classifier import BaselineClassifier from ._classifier import Classifier from ._decision_tree_classifier import DecisionTreeClassifier from ._gradient_boosting_classifier import GradientBoostingClassifier @@ -18,6 +19,7 @@ __name__, { "AdaBoostClassifier": "._ada_boost_classifier:AdaBoostClassifier", + "BaselineClassifier": "._baseline_classifier:BaselineClassifier", "Classifier": "._classifier:Classifier", "DecisionTreeClassifier": "._decision_tree_classifier:DecisionTreeClassifier", "GradientBoostingClassifier": "._gradient_boosting_classifier:GradientBoostingClassifier", @@ -30,6 +32,7 @@ __all__ = [ "AdaBoostClassifier", + "BaselineClassifier", "Classifier", "DecisionTreeClassifier", "GradientBoostingClassifier", diff --git a/src/safeds/ml/classical/classification/_baseline_classifier.py b/src/safeds/ml/classical/classification/_baseline_classifier.py new file mode 100644 index 000000000..7b58d61e2 --- /dev/null +++ b/src/safeds/ml/classical/classification/_baseline_classifier.py @@ -0,0 +1,188 @@ +import copy +from concurrent.futures import ALL_COMPLETED, wait +from typing import Self + +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric +from safeds.data.labeled.containers import TabularDataset +from safeds.exceptions import ( + DatasetMissesDataError, + FeatureDataMismatchError, + ModelNotFittedError, + TargetDataMismatchError, +) +from safeds.ml.classical.classification import ( + AdaBoostClassifier, + Classifier, + DecisionTreeClassifier, + GradientBoostingClassifier, + RandomForestClassifier, + SupportVectorClassifier, +) + + +def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier: + return model.fit(train_data) # pragma: no cover + + +def _predict_single_model(model: 
Classifier, test_data: TabularDataset) -> TabularDataset: + return model.predict(test_data) # pragma: no cover + + +class BaselineClassifier: + """ + Baseline Classifier. + + Get a baseline by fitting data on multiple different models and comparing the best metrics. + + Parameters ---------- extended_search: If set to true, an extended set of models will be used to fit the + classifier. This might result in significantly higher runtime. + """ + + def __init__(self, extended_search: bool = False): + self._is_fitted = False + self._list_of_model_types = [ + AdaBoostClassifier(), + DecisionTreeClassifier(), + SupportVectorClassifier(), + RandomForestClassifier(), + ] + if extended_search: + self._list_of_model_types.extend([GradientBoostingClassifier()]) # pragma: no cover + + self._fitted_models: list[Classifier] = [] + self._feature_names: list[str] | None = None + self._target_name: str = "none" + + def fit(self, train_data: TabularDataset) -> Self: + """ + Train the Classifier with given training data. + + The original model is not modified. + + Parameters + ---------- + train_data: + The data the models should be trained on. + + Returns + ------- + trained_classifier: + The trained Classifier + + Raises + ------ + DatasetMissesDataError + If the given train_data contains no data. + ColumnTypeError + If one or more columns contain non-numeric values. 
+ """ + from concurrent.futures import ProcessPoolExecutor + + # Validate Data + train_data_as_table = train_data.to_table() + if train_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names) + + copied_model = copy.deepcopy(self) + + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + futures = [] + for model in self._list_of_model_types: + futures.append(executor.submit(_fit_single_model, model, train_data)) + [done, _] = wait(futures, return_when=ALL_COMPLETED) + for future in done: + copied_model._fitted_models.append(future.result()) + executor.shutdown() + + copied_model._is_fitted = True + copied_model._feature_names = train_data.features.column_names + copied_model._target_name = train_data.target.name + return copied_model + + def predict(self, test_data: TabularDataset) -> dict[str, float]: + """ + Make a prediction for the given test data and calculate the best metrics. + + The original Model is not modified. + + Parameters + ---------- + test_data: + The data the Classifier should predict. + + Returns + ------- + best_metrics: + A dictionary with the best metrics that were achieved. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet + FeatureDataMismatchError + If the features of the test data do not match with the features of the trained Classifier. + DatasetMissesDataError + If the given test_data contains no data. + TargetDataMismatchError + If the target column of the test data does not match the target column of the training data. + ColumnTypeError + If one or more columns contain non-numeric values. 
+ """ + from concurrent.futures import ProcessPoolExecutor + + from safeds.ml.metrics import ClassificationMetrics + + if not self._is_fitted: + raise ModelNotFittedError + + # Validate data + if not self._feature_names == test_data.features.column_names: + raise FeatureDataMismatchError + if not self._target_name == test_data.target.name: + raise TargetDataMismatchError( + actual_target_name=test_data.target.name, + missing_target_name=self._target_name, + ) + test_data_as_table = test_data.to_table() + if test_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names) + + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + results = [] + futures = [] + for model in self._fitted_models: + futures.append(executor.submit(_predict_single_model, model, test_data)) + [done, _] = wait(futures, return_when=ALL_COMPLETED) + for future in done: + results.append(future.result()) + executor.shutdown() + + max_metrics = {"accuracy": 0.0, "f1score": 0.0, "precision": 0.0, "recall": 0.0} + for result in results: + accuracy = ClassificationMetrics.accuracy(result, test_data) + + positive_class = test_data.target.get_value(0) + f1score = ClassificationMetrics.f1_score(result, test_data, positive_class) + precision = ClassificationMetrics.precision(result, test_data, positive_class) + recall = ClassificationMetrics.recall(result, test_data, positive_class) + + if max_metrics.get("accuracy", 0.0) < accuracy: + max_metrics.update({"accuracy": accuracy}) + + if max_metrics.get("f1score", 0.0) < f1score: + max_metrics.update({"f1score": f1score}) + + if max_metrics.get("precision", 0.0) < precision: + max_metrics.update({"precision": precision}) + + if max_metrics.get("recall", 0.0) < recall: + max_metrics.update({"recall": recall}) + + return max_metrics + + @property + def is_fitted(self) -> bool: + """Whether the model is fitted.""" + 
return self._is_fitted diff --git a/src/safeds/ml/classical/regression/__init__.py b/src/safeds/ml/classical/regression/__init__.py index ed8c2bcbb..1dd3f627a 100644 --- a/src/safeds/ml/classical/regression/__init__.py +++ b/src/safeds/ml/classical/regression/__init__.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: from ._ada_boost_regressor import AdaBoostRegressor from ._arima import ArimaModelRegressor + from ._baseline_regressor import BaselineRegressor from ._decision_tree_regressor import DecisionTreeRegressor from ._elastic_net_regressor import ElasticNetRegressor from ._gradient_boosting_regressor import GradientBoostingRegressor @@ -23,6 +24,7 @@ { "AdaBoostRegressor": "._ada_boost_regressor:AdaBoostRegressor", "ArimaModelRegressor": "._arima:ArimaModelRegressor", + "BaselineRegressor": "._baseline_regressor:BaselineRegressor", "DecisionTreeRegressor": "._decision_tree_regressor:DecisionTreeRegressor", "ElasticNetRegressor": "._elastic_net_regressor:ElasticNetRegressor", "GradientBoostingRegressor": "._gradient_boosting_regressor:GradientBoostingRegressor", @@ -39,6 +41,7 @@ __all__ = [ "AdaBoostRegressor", "ArimaModelRegressor", + "BaselineRegressor", "DecisionTreeRegressor", "ElasticNetRegressor", "GradientBoostingRegressor", diff --git a/src/safeds/ml/classical/regression/_baseline_regressor.py b/src/safeds/ml/classical/regression/_baseline_regressor.py new file mode 100644 index 000000000..4562ed122 --- /dev/null +++ b/src/safeds/ml/classical/regression/_baseline_regressor.py @@ -0,0 +1,202 @@ +import copy +from concurrent.futures import ALL_COMPLETED, wait +from typing import Self + +from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric +from safeds.data.labeled.containers import TabularDataset +from safeds.exceptions import ( + DatasetMissesDataError, + FeatureDataMismatchError, + ModelNotFittedError, + TargetDataMismatchError, +) +from safeds.ml.classical.regression import ( + AdaBoostRegressor, + DecisionTreeRegressor, + 
ElasticNetRegressor, + GradientBoostingRegressor, + LassoRegressor, + LinearRegressor, + RandomForestRegressor, + Regressor, + RidgeRegressor, + SupportVectorRegressor, +) + + +def _fit_single_model(model: Regressor, train_data: TabularDataset) -> Regressor: + return model.fit(train_data) # pragma: no cover + + +def _predict_single_model(model: Regressor, test_data: TabularDataset) -> TabularDataset: + return model.predict(test_data) # pragma: no cover + + +class BaselineRegressor: + """ + Baseline Regressor. + + Get a baseline by fitting data on multiple different models and comparing the best metrics. + + Parameters ---------- include_slower_models: If set to true, an extended set of models will be used to fit the + regressor. This might result in significantly higher runtime. + """ + + def __init__(self, include_slower_models: bool = False): + self._is_fitted = False + self._list_of_model_types = [ + AdaBoostRegressor(), + DecisionTreeRegressor(), + LinearRegressor(), + RandomForestRegressor(), + RidgeRegressor(), + SupportVectorRegressor(), + ] + + if include_slower_models: + self._list_of_model_types.extend( + [ElasticNetRegressor(), LassoRegressor(), GradientBoostingRegressor()], + ) # pragma: no cover + + self._fitted_models: list[Regressor] = [] + self._feature_names: list[str] | None = None + self._target_name: str = "none" + + def fit(self, train_data: TabularDataset) -> Self: + """ + Train the Regressor with given training data. + + The original model is not modified. + + Parameters + ---------- + train_data: + The data the models should be trained on. + + Returns + ------- + trained_regressor: + The trained Regressor + + Raises + ------ + DatasetMissesDataError + If the given train_data contains no data. + ColumnTypeError + If one or more columns contain non-numeric values. 
+ """ + from concurrent.futures import ProcessPoolExecutor + + # Validate Data + train_data_as_table = train_data.to_table() + if train_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names) + + copied_model = copy.deepcopy(self) + + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + futures = [] + for model in self._list_of_model_types: + futures.append(executor.submit(_fit_single_model, model, train_data)) + [done, _] = wait(futures, return_when=ALL_COMPLETED) + for future in done: + copied_model._fitted_models.append(future.result()) + executor.shutdown() + + copied_model._is_fitted = True + copied_model._feature_names = train_data.features.column_names + copied_model._target_name = train_data.target.name + return copied_model + + def predict(self, test_data: TabularDataset) -> dict[str, float]: + """ + Make a prediction for the given test data and calculate the best metrics. + + The original Model is not modified. + + Parameters + ---------- + test_data: + The data the Regressor should predict. + + Returns + ------- + best_metrics: + A dictionary with the best metrics that were achieved. + + Raises + ------ + ModelNotFittedError + If the model has not been fitted yet + FeatureDataMismatchError + If the features of the test data do not match with the features of the trained Regressor. + DatasetMissesDataError + If the given test_data contains no data. + TargetDataMismatchError + If the target column of the test data does not match the target column of the training data. + ColumnTypeError + If one or more columns contain non-numeric values. 
+ """ + from concurrent.futures import ProcessPoolExecutor + + from safeds.ml.metrics import RegressionMetrics + + if not self._is_fitted: + raise ModelNotFittedError + + # Validate data + if not self._feature_names == test_data.features.column_names: + raise FeatureDataMismatchError + if not self._target_name == test_data.target.name: + raise TargetDataMismatchError( + actual_target_name=test_data.target.name, + missing_target_name=self._target_name, + ) + test_data_as_table = test_data.to_table() + if test_data_as_table.row_count == 0: + raise DatasetMissesDataError + _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names) + + # Start Processes + with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor: + results = [] + futures = [] + for model in self._fitted_models: + futures.append(executor.submit(_predict_single_model, model, test_data)) + [done, _] = wait(futures, return_when=ALL_COMPLETED) + for future in done: + results.append(future.result()) + executor.shutdown() + + # Calculate Metrics + max_metrics = { + "coefficient_of_determination": float("-inf"), + "mean_absolute_error": float("inf"), + "mean_squared_error": float("inf"), + "median_absolute_deviation": float("inf"), + } + for result in results: + coefficient_of_determination = RegressionMetrics.coefficient_of_determination(result, test_data) + mean_absolute_error = RegressionMetrics.mean_absolute_error(result, test_data) + mean_squared_error = RegressionMetrics.mean_squared_error(result, test_data) + median_absolute_deviation = RegressionMetrics.median_absolute_deviation(result, test_data) + + if max_metrics.get("coefficient_of_determination", float("-inf")) < coefficient_of_determination: + max_metrics.update({"coefficient_of_determination": coefficient_of_determination}) + + if max_metrics.get("mean_absolute_error", float("inf")) > mean_absolute_error: + max_metrics.update({"mean_absolute_error": mean_absolute_error}) + 
+ if max_metrics.get("mean_squared_error", float("inf")) > mean_squared_error: + max_metrics.update({"mean_squared_error": mean_squared_error}) + + if max_metrics.get("median_absolute_deviation", float("inf")) > median_absolute_deviation: + max_metrics.update({"median_absolute_deviation": median_absolute_deviation}) + + return max_metrics + + @property + def is_fitted(self) -> bool: + """Whether the model is fitted.""" + return self._is_fitted diff --git a/tests/safeds/ml/classical/classification/test_baseline_classifier.py b/tests/safeds/ml/classical/classification/test_baseline_classifier.py new file mode 100644 index 000000000..8f507c41a --- /dev/null +++ b/tests/safeds/ml/classical/classification/test_baseline_classifier.py @@ -0,0 +1,85 @@ +import pytest +from safeds.data.tabular.containers import Table +from safeds.exceptions import ( + ColumnTypeError, + DatasetMissesDataError, + FeatureDataMismatchError, + ModelNotFittedError, + TargetDataMismatchError, +) +from safeds.ml.classical.classification import BaselineClassifier + + +class TestBaselineClassifier: + def test_should_raise_if_fit_dataset_contains_no_data(self) -> None: + model = BaselineClassifier() + data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + with pytest.raises(DatasetMissesDataError): + model.fit(data) + + def test_should_raise_if_predict_dataset_contains_no_data(self) -> None: + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(DatasetMissesDataError): + model.predict(predict_data) + + def test_should_raise_if_fit_dataset_contains_non_numerical_columns(self) -> None: + model = BaselineClassifier() + data = Table({"feat": ["a", "b"], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ColumnTypeError): + model.fit(data) + + def 
test_should_raise_if_predict_dataset_contains_non_numerical_columns(self) -> None: + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": ["zero", "one"], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(ColumnTypeError): + model.predict(predict_data) + + def test_should_check_that_fit_returns_baseline_classifier(self) -> None: + model = BaselineClassifier() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert isinstance(model.fit(data), BaselineClassifier) + + def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: + model = BaselineClassifier() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert not model.is_fitted + model = model.fit(data) + assert model.is_fitted + + def test_should_raise_if_model_not_fitted(self) -> None: + model = BaselineClassifier() + predict_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ModelNotFittedError): + model.predict(predict_data) + + def test_should_raise_if_predict_data_has_differing_features(self) -> None: + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"other": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(FeatureDataMismatchError): + model.predict(predict_data) + + def test_should_raise_if_predict_data_misses_target_column(self) -> None: + model = BaselineClassifier() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [0, 1], "other": [0, 1]}).to_tabular_dataset("other") + model = model.fit(fit_data) + with pytest.raises(TargetDataMismatchError): + model.predict(predict_data) + + def test_check_predict_return_type_and_values(self) -> None: + 
model = BaselineClassifier() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(data) + result = model.predict(data) + assert isinstance(result, dict) + assert result.get("accuracy", 0.0) >= 0.0 + assert result.get("f1score", 0.0) >= 0.0 + assert result.get("precision", 0.0) >= 0.0 + assert result.get("recall", 0.0) >= 0.0 diff --git a/tests/safeds/ml/classical/regression/test_baseline_regressor.py b/tests/safeds/ml/classical/regression/test_baseline_regressor.py new file mode 100644 index 000000000..2d8816ef2 --- /dev/null +++ b/tests/safeds/ml/classical/regression/test_baseline_regressor.py @@ -0,0 +1,85 @@ +import pytest +from safeds.data.tabular.containers import Table +from safeds.exceptions import ( + ColumnTypeError, + DatasetMissesDataError, + FeatureDataMismatchError, + ModelNotFittedError, + TargetDataMismatchError, +) +from safeds.ml.classical.regression import BaselineRegressor + + +class TestBaselineRegressor: + def test_should_raise_if_fit_dataset_contains_no_data(self) -> None: + model = BaselineRegressor() + data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + with pytest.raises(DatasetMissesDataError): + model.fit(data) + + def test_should_raise_if_predict_dataset_contains_no_data(self) -> None: + model = BaselineRegressor() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [], "target": []}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(DatasetMissesDataError): + model.predict(predict_data) + + def test_should_raise_if_fit_dataset_contains_non_numerical_columns(self) -> None: + model = BaselineRegressor() + data = Table({"feat": ["a", "b"], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ColumnTypeError): + model.fit(data) + + def test_should_raise_if_predict_dataset_contains_non_numerical_columns(self) -> None: + model = BaselineRegressor() + fit_data = 
Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": ["zero", "one"], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(ColumnTypeError): + model.predict(predict_data) + + def test_should_check_that_fit_returns_baseline_classifier(self) -> None: + model = BaselineRegressor() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert isinstance(model.fit(data), BaselineRegressor) + + def test_should_raise_if_is_fitted_is_set_correctly(self) -> None: + model = BaselineRegressor() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + assert not model.is_fitted + model = model.fit(data) + assert model.is_fitted + + def test_should_raise_if_model_not_fitted(self) -> None: + model = BaselineRegressor() + predict_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + with pytest.raises(ModelNotFittedError): + model.predict(predict_data) + + def test_should_raise_if_predict_data_has_differing_features(self) -> None: + model = BaselineRegressor() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"other": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = model.fit(fit_data) + with pytest.raises(FeatureDataMismatchError): + model.predict(predict_data) + + def test_should_raise_if_predict_data_misses_target_column(self) -> None: + model = BaselineRegressor() + fit_data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + predict_data = Table({"feat": [0, 1], "other": [0, 1]}).to_tabular_dataset("other") + model = model.fit(fit_data) + with pytest.raises(TargetDataMismatchError): + model.predict(predict_data) + + def test_check_predict_return_type_and_values(self) -> None: + model = BaselineRegressor() + data = Table({"feat": [0, 1], "target": [0, 1]}).to_tabular_dataset("target") + model = 
model.fit(data) + result = model.predict(data) + assert isinstance(result, dict) + assert result.get("coefficient_of_determination", float("-inf")) >= float("-inf") + assert result.get("mean_absolute_error", float("inf")) <= float("inf") + assert result.get("mean_squared_error", float("inf")) <= float("inf") + assert result.get("median_absolute_deviation", float("inf")) <= float("inf") diff --git a/tests/safeds/ml/nn/test_model.py b/tests/safeds/ml/nn/test_model.py index 0902d630d..43fc67aa6 100644 --- a/tests/safeds/ml/nn/test_model.py +++ b/tests/safeds/ml/nn/test_model.py @@ -212,7 +212,7 @@ def test_should_raise_if_test_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): model.predict( Table.from_dict({"a": [1], "c": [2]}), @@ -229,7 +229,7 @@ def test_should_raise_if_train_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): learned_model.fit(Table.from_dict({"k": [0.1, 0, 0.2], "l": [0, 0.15, 0.5]}).to_tabular_dataset("k")) @@ -636,7 +636,7 @@ def test_should_raise_if_test_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): model.predict( Table.from_dict({"a": [1], "c": [2]}), @@ -653,7 +653,7 @@ def 
test_should_raise_if_train_features_mismatch(self, device: Device) -> None: ) with pytest.raises( FeatureDataMismatchError, - match="The features in the given table do not match with the specified feature columns names of the neural network.", + match="The features in the given table do not match with the specified feature columns names of the model.", ): trained_model.fit( Table.from_dict({"k": [1, 0, 2], "l": [0, 15, 5]}).to_tabular_dataset("l"),