From 33c0b2c5515f64d2942e99cbf5913cfa0efbb231 Mon Sep 17 00:00:00 2001 From: Sefik Ilkin Serengil Date: Mon, 25 Dec 2023 17:43:24 +0000 Subject: [PATCH] modular unit tests --- .github/workflows/tests.yml | 2 +- Makefile | 2 +- chefboost/Chefboost.py | 81 +++-- chefboost/commons/evaluate.py | 90 +++-- chefboost/training/Training.py | 18 +- chefboost/tuning/adaboost.py | 11 +- chefboost/tuning/gbm.py | 53 ++- chefboost/tuning/randomforest.py | 33 +- .../test_adaboost.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 1691 bytes .../test_c45.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 1906 bytes .../test_cart.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 1910 bytes .../test_chaid.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 1975 bytes .../test_gbm.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 2791 bytes .../test_id3.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 9261 bytes ...t_randomforest.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 3370 bytes ...est_regression.cpython-38-pytest-7.1.2.pyc | Bin 0 -> 1752 bytes tests/global-unit-test.py | 339 ------------------ tests/test_adaboost.py | 27 ++ tests/test_c45.py | 24 ++ tests/test_cart.py | 25 ++ tests/test_chaid.py | 26 ++ tests/test_gbm.py | 48 +++ tests/test_id3.py | 114 ++++++ tests/test_randomforest.py | 55 +++ tests/test_regression.py | 27 ++ 25 files changed, 549 insertions(+), 426 deletions(-) create mode 100644 tests/__pycache__/test_adaboost.cpython-38-pytest-7.1.2.pyc create mode 100644 tests/__pycache__/test_c45.cpython-38-pytest-7.1.2.pyc create mode 100644 tests/__pycache__/test_cart.cpython-38-pytest-7.1.2.pyc create mode 100644 tests/__pycache__/test_chaid.cpython-38-pytest-7.1.2.pyc create mode 100644 tests/__pycache__/test_gbm.cpython-38-pytest-7.1.2.pyc create mode 100644 tests/__pycache__/test_id3.cpython-38-pytest-7.1.2.pyc create mode 100644 tests/__pycache__/test_randomforest.cpython-38-pytest-7.1.2.pyc create mode 100644 tests/__pycache__/test_regression.cpython-38-pytest-7.1.2.pyc delete mode 100644 tests/global-unit-test.py create mode 100644 tests/test_adaboost.py create mode 100644 tests/test_c45.py create mode 100644 tests/test_cart.py create mode 100644 tests/test_chaid.py create mode 100644 tests/test_gbm.py create mode 100644 tests/test_id3.py create mode 100644 tests/test_randomforest.py create mode 100644 tests/test_regression.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5a5350b..d7cf83a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -40,7 +40,7 @@ jobs: - name: Test with pytest run: | cd tests - python global-unit-test.py + python -m pytest . -s --disable-warnings linting: needs: unit-tests diff --git a/Makefile b/Makefile index ab7f41a..168bbb7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ test: - cd tests && python global-unit-test.py + cd tests && python -m pytest . -s --disable-warnings lint: python -m pylint chefboost/ --fail-under=10 \ No newline at end of file diff --git a/chefboost/Chefboost.py b/chefboost/Chefboost.py index bd9ef1e..9cf0e1a 100644 --- a/chefboost/Chefboost.py +++ b/chefboost/Chefboost.py @@ -24,6 +24,7 @@ def fit( config: Optional[dict] = None, target_label: str = "Decision", validation_df: Optional[pd.DataFrame] = None, + silent: bool = False, ) -> Dict[str, Any]: """ Build (a) decision tree model(s) @@ -55,6 +56,9 @@ def fit( if nothing is passed to validation data frame, then the function validates built trees for training data frame + silent (bool): set this to True if you do not want to see + any informative logs + Returns: chefboost model """ @@ -139,7 +143,8 @@ def fit( if enableParallelism == True: num_cores = config["num_cores"] - logger.info(f"[INFO]: {num_cores} CPU cores will be allocated in parallel running") + if silent is False: + logger.info(f"[INFO]: {num_cores} CPU cores will be allocated in parallel running") from multiprocessing import set_start_method, freeze_support @@ -169,7 +174,8 @@ def fit( config["algorithm"] = "Regression" if enableGBM == True: - logger.info("Gradient Boosting Machines...") + if silent is False: + logger.info("Gradient Boosting Machines...") algorithm = "Regression" config["algorithm"] = "Regression" @@ -184,7 +190,8 @@ def fit( # ------------------------- - logger.info(f"{algorithm} tree is going to be built...") + if silent is False: + logger.info(f"{algorithm} tree is going to be built...") # initialize a dictionary. this is going to be used to check features numeric or nominal. # numeric features should be transformed to nominal values based on scales. @@ -212,7 +219,13 @@ def fit( if enableAdaboost == True: trees, alphas = adaboost_clf.apply( - df, config, header, dataset_features, validation_df=validation_df, process_id=process_id + df, + config, + header, + dataset_features, + validation_df=validation_df, + process_id=process_id, + silent=silent, ) elif enableGBM == True: @@ -224,6 +237,7 @@ def fit( dataset_features, validation_df=validation_df, process_id=process_id, + silent=silent, ) # classification = True @@ -235,12 +249,19 @@ def fit( dataset_features, validation_df=validation_df, process_id=process_id, + silent=silent, ) # classification = False elif enableRandomForest == True: trees = randomforest.apply( - df, config, header, dataset_features, validation_df=validation_df, process_id=process_id + df, + config, + header, + dataset_features, + validation_df=validation_df, + process_id=process_id, + silent=silent, ) else: # regular decision tree building root = 1 @@ -264,8 +285,9 @@ def fit( main_process_id=process_id, ) - logger.info("-------------------------") - logger.info(f"finished in {time.time() - begin} seconds") + if silent is False: + logger.info("-------------------------") + logger.info(f"finished in {time.time() - begin} seconds") obj = {"trees": trees, "alphas": alphas, "config": config, "nan_values": nan_values} @@ -273,13 +295,13 @@ def fit( # train set accuracy df = base_df.copy() - evaluate(obj, df, task="train") + trainset_evaluation = evaluate(obj, df, task="train", silent=silent) + obj["evaluation"] = {"train": trainset_evaluation} # validation set accuracy if isinstance(validation_df, pd.DataFrame): - evaluate(obj, validation_df, task="validation") - - # ----------------------------------------- + validationset_evaluation = evaluate(obj, validation_df, task="validation", silent=silent) + obj["evaluation"]["validation"] = validationset_evaluation return obj @@ -455,31 +477,38 @@ def restoreTree(module_name) -> Any: return functions.restoreTree(module_name) -def feature_importance(rules: Union[str, list]) -> pd.DataFrame: +def feature_importance(rules: Union[str, list], silent: bool = False) -> pd.DataFrame: """ Show the feature importance values of a built model Args: - rules (str or list): e.g. decision_rules = "outputs/rules/rules.py" + rules (str or list): e.g. decision_rules = "outputs/rules/rules.py" or this could be retrieved from built model as shown below. - decision_rules = [] - for tree in model["trees"]: - rule = .__dict__["__spec__"].origin - decision_rules.append(rule) + ```python + decision_rules = [] + for tree in model["trees"]: + rule = .__dict__["__spec__"].origin + decision_rules.append(rule) + ``` + silent (bool): set this to True if you do want to see + any informative logs. Returns: feature importance (pd.DataFrame) """ if not isinstance(rules, list): rules = [rules] - logger.info(f"rules: {rules}") + + if silent is False: + logger.info(f"rules: {rules}") # ----------------------------- dfs = [] for rule in rules: - logger.info("Decision rule: {rule}") + if silent is False: + logger.info(f"Decision rule: {rule}") with open(rule, "r", encoding="UTF-8") as file: lines = file.readlines() @@ -564,8 +593,12 @@ def feature_importance(rules: Union[str, list]) -> pd.DataFrame: def evaluate( - model: dict, df: pd.DataFrame, target_label: str = "Decision", task: str = "test" -) -> None: + model: dict, + df: pd.DataFrame, + target_label: str = "Decision", + task: str = "test", + silent: bool = False, +) -> dict: """ Evaluate the performance of a built model on a data set Args: @@ -573,8 +606,10 @@ def evaluate( df (pandas data frame): data frame you would like to evaluate target_label (str): target label task (string): set this to train, validation or test + silent (bool): set this to True if you do not want to see + any informative logs Returns: - None + evaluation results (dict) """ # -------------------------- @@ -598,4 +633,4 @@ def evaluate( df["Decision"] = df["Decision"].astype(str) df["Prediction"] = df["Prediction"].astype(str) - cb_eval.evaluate(df, task=task) + return cb_eval.evaluate(df, task=task, silent=silent) diff --git a/chefboost/commons/evaluate.py b/chefboost/commons/evaluate.py index 44eba39..2cb480d 100644 --- a/chefboost/commons/evaluate.py +++ b/chefboost/commons/evaluate.py @@ -1,4 +1,5 @@ import math +import pandas as pd from chefboost.commons.logger import Logger # pylint: disable=broad-except @@ -6,25 +7,38 @@ logger = Logger(module="chefboost/commons/evaluate.py") -def evaluate(df, task="train"): +def evaluate(df: pd.DataFrame, task: str = "train", silent: bool = False) -> dict: + """ + Evaluate results + Args: + df (pd.DataFrame): data frame + task (str): train, test + silent (bool): set this to True if you do not want to + see any informative logs + Returns: + evaluation results (dict) + """ if df["Decision"].dtypes == "object": problem_type = "classification" else: problem_type = "regression" - # ------------------------------------- - + evaluation_results = {} instances = df.shape[0] - logger.info("-------------------------") - logger.info(f"Evaluate {task} set") - logger.info("-------------------------") + if silent is False: + logger.info("-------------------------") + logger.info(f"Evaluate {task} set") + logger.info("-------------------------") if problem_type == "classification": idx = df[df["Prediction"] == df["Decision"]].index accuracy = 100 * len(idx) / df.shape[0] - logger.info(f"Accuracy: {accuracy}% on {instances} instances") + if silent is False: + logger.info(f"Accuracy: {accuracy}% on {instances} instances") + evaluation_results["Accuracy"] = accuracy + evaluation_results["Instances"] = instances # ----------------------------- predictions = df.Prediction.values @@ -48,8 +62,12 @@ def evaluate(df, task="train"): confusion_row.append(item) confusion_matrix.append(confusion_row) - logger.info(f"Labels: {labels}") - logger.info(f"Confusion matrix: {confusion_matrix}") + if silent is False: + logger.info(f"Labels: {labels}") + logger.info(f"Confusion matrix: {confusion_matrix}") + + evaluation_results["Labels"] = labels + evaluation_results["Confusion matrix"] = confusion_matrix # ----------------------------- # precision and recall @@ -79,11 +97,19 @@ def evaluate(df, task="train"): accuracy = round(100 * (tp + tn) / (tp + tn + fp + fn + epsilon), 4) if len(labels) >= 3: - logger.info(f"Decision {decision_class}") - logger.info(f"Accuray: {accuracy}") + if silent is False: + logger.info(f"Decision {decision_class}") + logger.info(f"Accuracy: {accuracy}") + + evaluation_results[f"Decision {decision_class}'s Accuracy"] = accuracy - logger.info(f"Precision: {precision}%, Recall: {recall}%, F1: {f1_score}%") - logger.debug(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}") + if silent is False: + logger.info(f"Precision: {precision}%, Recall: {recall}%, F1: {f1_score}%") + logger.debug(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}") + + evaluation_results["Precision"] = precision + evaluation_results["Recall"] = recall + evaluation_results["F1"] = f1_score if len(labels) < 3: break @@ -99,13 +125,17 @@ def evaluate(df, task="train"): if instances > 0: mae = df["Absolute_Error"].sum() / instances - logger.info(f"MAE: {mae}") - mse = df["Absolute_Error_Squared"].sum() / instances - logger.info(f"MSE: {mse}") - rmse = math.sqrt(mse) - logger.info(f"RMSE: {rmse}") + + evaluation_results["MAE"] = mae + evaluation_results["MSE"] = mse + evaluation_results["RMSE"] = rmse + + if silent is False: + logger.info(f"MAE: {mae}") + logger.info(f"MSE: {mse}") + logger.info(f"RMSE: {rmse}") rae = 0 rrse = 0 @@ -122,12 +152,26 @@ def evaluate(df, task="train"): except Exception as err: logger.error(str(err)) - logger.info(f"RAE: {rae}") - logger.info(f"RRSE {rrse}") + if silent is False: + logger.info(f"RAE: {rae}") + logger.info(f"RRSE {rrse}") + + evaluation_results["RAE"] = rae + evaluation_results["RRSE"] = rrse mean = df["Decision"].mean() - logger.info(f"Mean: {mean}") + + if silent is False: + logger.info(f"Mean: {mean}") + + evaluation_results["Mean"] = mean if mean > 0: - logger.info(f"MAE / Mean: {100 * mae / mean}%") - logger.info(f"RMSE / Mean: {100 * rmse / mean}%") + if silent is False: + logger.info(f"MAE / Mean: {100 * mae / mean}%") + logger.info(f"RMSE / Mean: {100 * rmse / mean}%") + + evaluation_results["MAE / Mean"] = 100 * mae / mean + evaluation_results["RMSE / Mean"] = 100 * rmse / mean + + return evaluation_results diff --git a/chefboost/training/Training.py b/chefboost/training/Training.py index ee37ca7..b7c6e06 100644 --- a/chefboost/training/Training.py +++ b/chefboost/training/Training.py @@ -510,16 +510,10 @@ def buildDecisionTree( # add else condition in the decision tree if df.Decision.dtypes == "object": # classification - pivot = pd.DataFrame(subdataset.Decision.value_counts()).reset_index() - - if pd.__version__.split(".")[0] == "1": - pivot = pivot.rename(columns={"Decision": "Instances", "index": "Decision"}) - else: # if pd.__version__.split(".")[0] == "2": - pivot = pivot.rename(columns={"Decision": "Instances", "count": "Decision"}) - - pivot = pivot.sort_values(by=["Instances"], ascending=False).reset_index() - - else_decision = f"return '{pivot.iloc[0].Decision}'" + pivot = pd.DataFrame(subdataset.Decision.value_counts()).sort_values( + by=["count"], ascending=False + ) + else_decision = f"return '{str(pivot.iloc[0].name)}'" if enableParallelism != True: functions.storeRule(file, (functions.formatRule(root), "else:")) @@ -669,7 +663,7 @@ def buildDecisionTree( # this is reguler decision tree. find accuracy here. module_name = "outputs/rules/rules" - myrules = load_module(module_name) # rules0 + myrules = load_module(module_name) # rules0 models.append(myrules) return models @@ -682,7 +676,7 @@ def findPrediction(row): params.append(row[j]) module_name = "outputs/rules/rules" - myrules = load_module(module_name) # rules0 + myrules = load_module(module_name) # rules0 prediction = myrules.findDecision(params) return prediction diff --git a/chefboost/tuning/adaboost.py b/chefboost/tuning/adaboost.py index cda488e..3623347 100644 --- a/chefboost/tuning/adaboost.py +++ b/chefboost/tuning/adaboost.py @@ -31,7 +31,9 @@ def findPrediction(row): return prediction -def apply(df, config, header, dataset_features, validation_df=None, process_id=None): +def apply( + df, config, header, dataset_features, validation_df=None, process_id=None, silent: bool = False +): models = [] alphas = [] @@ -53,8 +55,7 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N best_epoch_idx = 0 best_epoch_value = 1000000 - # for i in range(0, num_of_weak_classifier): - pbar = tqdm(range(0, num_of_weak_classifier), desc="Adaboosting") + pbar = tqdm(range(0, num_of_weak_classifier), desc="Adaboosting", disable=silent) for i in pbar: worksheet["Decision"] = worksheet["Weight"] * worksheet["Decision"] @@ -139,8 +140,8 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N pbar.set_description(f"Epoch {i + 1}. Loss: {mae}. Process: ") # ------------------------------ - - logger.info(f"The best epoch is {best_epoch_idx} with the {best_epoch_value} MAE score") + if silent is False: + logger.info(f"The best epoch is {best_epoch_idx} with the {best_epoch_value} MAE score") models = models[0 : best_epoch_idx + 1] alphas = alphas[0 : best_epoch_idx + 1] diff --git a/chefboost/tuning/gbm.py b/chefboost/tuning/gbm.py index 95bb17f..85ac99b 100644 --- a/chefboost/tuning/gbm.py +++ b/chefboost/tuning/gbm.py @@ -1,4 +1,5 @@ import gc +from typing import Optional, Union import pandas as pd import numpy as np @@ -14,7 +15,7 @@ logger = Logger(module="chefboost/tuning/gbm.py") -def findPrediction(row): +def findPrediction(row: pd.Series) -> Union[str, float]: epoch = row["Epoch"] row = row.drop(labels=["Epoch"]) columns = row.shape[0] @@ -32,7 +33,15 @@ def findPrediction(row): return prediction -def regressor(df, config, header, dataset_features, validation_df=None, process_id=None): +def regressor( + df: pd.DataFrame, + config: dict, + header: str, + dataset_features: dict, + validation_df: Optional[pd.DataFrame] = None, + process_id: Optional[int] = None, + silent: bool = False, +) -> list: models = [] # we will update decisions in every epoch, this will be used to restore @@ -69,10 +78,7 @@ def regressor(df, config, header, dataset_features, validation_df=None, process_ best_epoch_idx = 0 best_epoch_loss = 1000000 - pbar = tqdm(range(1, epochs + 1), desc="Boosting") - - # for index in range(1,epochs+1): - # for index in tqdm(range(1,epochs+1), desc='Boosting'): + pbar = tqdm(range(1, epochs + 1), desc="Boosting", disable=silent) for index in pbar: logger.debug(f"epoch {index} - ") loss = 0 @@ -155,22 +161,33 @@ def regressor(df, config, header, dataset_features, validation_df=None, process_ # --------------------------------- - logger.info(f"The best epoch is {best_epoch_idx} with {best_epoch_loss} loss value") + if silent is False: + logger.info(f"The best epoch is {best_epoch_idx} with {best_epoch_loss} loss value") models = models[0:best_epoch_idx] config["epochs"] = best_epoch_idx - logger.info( - f"MSE of {num_of_instances} instances are boosted from {boosted_from}" - f"to {best_epoch_loss} in {epochs} epochs" - ) + if silent is False: + logger.info( + f"MSE of {num_of_instances} instances are boosted from {boosted_from}" + f"to {best_epoch_loss} in {epochs} epochs" + ) return models -def classifier(df, config, header, dataset_features, validation_df=None, process_id=None): +def classifier( + df: pd.DataFrame, + config: dict, + header: str, + dataset_features: dict, + validation_df: Optional[pd.DataFrame] = None, + process_id: Optional[int] = None, + silent: bool = False, +) -> tuple: models = [] - logger.info("gradient boosting for classification") + if silent is False: + logger.info("gradient boosting for classification") epochs = config["epochs"] enableParallelism = config["enableParallelism"] @@ -182,7 +199,7 @@ def classifier(df, config, header, dataset_features, validation_df=None, process boosted_predictions = np.zeros([df.shape[0], len(classes)]) - pbar = tqdm(range(0, epochs), desc="Boosting") + pbar = tqdm(range(0, epochs), desc="Boosting", disable=silent) # store actual set, we will use this to calculate loss actual_set = pd.DataFrame(np.zeros([df.shape[0], len(classes)]), columns=classes) @@ -317,9 +334,11 @@ def classifier(df, config, header, dataset_features, validation_df=None, process # -------------------------------- - logger.info( - f"The best accuracy got in {best_accuracy_idx} epoch with the score {best_accuracy_value}" - ) + if silent is False: + logger.info( + f"The best accuracy got in {best_accuracy_idx} epoch" + f" with the score {best_accuracy_value}" + ) models = models[0 : best_accuracy_idx * len(classes) + len(classes)] diff --git a/chefboost/tuning/randomforest.py b/chefboost/tuning/randomforest.py index d6dfe6e..a8a1c40 100644 --- a/chefboost/tuning/randomforest.py +++ b/chefboost/tuning/randomforest.py @@ -1,7 +1,9 @@ +from typing import Optional import multiprocessing from contextlib import closing from tqdm import tqdm +import pandas as pd from chefboost.commons import functions from chefboost.training import Training @@ -10,7 +12,15 @@ # pylint: disable=unused-argument -def apply(df, config, header, dataset_features, validation_df=None, process_id=None): +def apply( + df: pd.DataFrame, + config: dict, + header: str, + dataset_features: dict, + validation_df: Optional[pd.DataFrame] = None, + process_id: Optional[int] = None, + silent: bool = False, +): models = [] num_of_trees = config["num_of_trees"] @@ -24,9 +34,10 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N input_params = [] - pbar = tqdm(range(0, num_of_trees), desc="Bagging") + pbar = tqdm(range(0, num_of_trees), desc="Bagging", disable=silent) for i in pbar: - pbar.set_description(f"Sub decision tree {i + 1} is processing") + if silent is False: + pbar.set_description(f"Sub decision tree {i + 1} is processing") subset = df.sample(frac=1 / num_of_trees) root = 1 @@ -38,7 +49,19 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N if parallelism_on: # parallel run input_params.append( - (subset, root, file, config, dataset_features, 0, 0, "root", i, None, process_id) + ( + subset, + root, + file, + config, + dataset_features, + 0, + 0, + "root", + i, + None, + process_id, + ) ) else: # serial run @@ -75,7 +98,7 @@ def apply(df, config, header, dataset_features, validation_df=None, process_id=N # all functions registered here # results = [] - for f in tqdm(funclist): + for f in tqdm(funclist, disable=silent): _ = f.get(timeout=100000) # this was branch_results # results.append(branch_results) diff --git a/tests/__pycache__/test_adaboost.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_adaboost.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5469e37b860b4bb1be9c7a21f94cf362a8444a42 GIT binary patch literal 1691 zcmZWqPjA~c6elTJmSxG#zh((`@j48M4vX6)Op=rBPsDMz#Z7$nm+ znRHpVVV45=06Eyl?X(ZDPr~E2OTI#}!yaWPa5ox!djI&|kMEK63(s>9jK6>Xh5bel z`b!Io!vNtJrg#p7B8pQK;QC84gCLADGiczM#R{5WA*r3U0;kp)X*+WR7bE6UlQ!Ps zz@rvz{)mDO#a|=ezJ{pinU7WL$tjCZib6;btS^hXO_4`B>}(YTeC8# zOr5$7HYJE(mCiSE}X-(0{ z5V32CZOujb^1ZssS$SP2)qA5*annd<;w14iEd1>ZalPh^I710 z#fF>-kreqi*ZP<}^-a}^(qX|9d71?smPaQkdpz%Ku%4e~VG)O~STqU;X#^|dgz;c` z=J!0}QJS(e5!uBWjbtR4?B5;>`C9r!c`u_h0UM+YX{;=fq%4=$CL|xigy9}v!_h!K zen$=ueSER{$=3M%o-cZbhrLfVzv~O-j5(vpK!<^#1~>#jb}u@BiE-J}e*3=A%Ke}r zIb#9>;vJ{zK66< zj5v`&bG}dI9FNau*Nm&>Z~%1Lk_%;}1$YRx5sK5|ETy$; zI82L^C=K%{W2zO3C}ui>>O8JX29^AR^Mb3@FfMo&$&kGsr%_%5=3Us64y$jyGLt+m z{3T@#iabt+iqM$rd#xJvPL&N9WRwpWcXiF$8r!TdJBz~JqHupE%u+vC6b=^Mb{B>H zMd7_Q4}-cOvuOA4{fhWUfaKkS{tGxAqAysSjQXE5F_Oi&Ke&0=>UZq_ubJ}Rd*5t` zBpw1mgy8!3k7YU7CM(1yE-{gXY=1z qd)7s+|2X)EnrJ+AffmSJ&?)<~*(-jeFQm}Tw9yK(v5Q^e82*R!W+bFdZ{@Z{D9N3$L)(ZOb|GSAUfv7)jnB?@azFGb5k7Z# z=VuxYxXZggkZ{O7-h=)I@ADn#cRBrnjQu}g74m3IP4DEXNKVUAX)xS}4IqME4zkWayzBXp6ms!DR=k5 zw@((}U)Z%(+h^9osU7vLb}l=-TRREm9{Lk&>449zttD-Gdr5ikg>{=OI(U}4lTsMF zx(iXSc9-;f`W-Olz_+0}3tRUt``lkrWC+eYcyzJLnPy7p>8#9>iGHq4k7ct`ruu1q z<(wQ&jvlWp_T@;b|Uv@PqqsB77Q zBS{zFvBdk?Xjozel0L!!VTf=8VHaV9@FoK0rG>X)WbVI|?}p}9w6EeG7kL%;??C)@ z=6+LBVkRe;hyO?Y`~x`uOG-nAH$GVB_3w-S?@8KglGMBwtzWzKAJo4K@$txbSe0~I zDC6Um-dOulTS_MFC5Lh_k>VSul~6`DF4JB6B(4^^707}qxd(>nZ7#92z?L_0#v2>w z1d38#7HZPg7-{32eS`o3g&}c^TXJlK*=W+c-~Nq0p)f5>f4^{xqkun C_~(rP literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_cart.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_cart.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a45a5001340cfd7a344f79416964a7bca645190 GIT binary patch literal 1910 zcmdT_Pmj|^6rZsj$H_X&?m`iJsOk$UX;H$`g>~)Rv#OT+c4Ba08Thf$dFdEmD(1eBz1-jT1VVr6CzgPrLAGR^4ZB+>JNQNgwGw` z_?r$p+~v&|WVp^f-h%lYZ}T;nH#ogdy8eIgE9736n%3SU5gp{Y(qOoE^YK{7<1HCWgM^mm}lfxTfkQ(L!A+T5Q}qzEqExp%z5nPy7p{y0yfo_?lHizVY+#`;le zoV`20?(fenWjx3xsgUs~hy>FUDOBxT ze4v>e3msrW0-k4LzGkd~8&3+x!;$*USfc}DN3k~T+r?3pHzv&{Bri!qDGE8t(}GE1 zTA^Z*K#y6!QynzUvtOjlr8ytMky55Z@vKN##xz8Xn^cN0PMk&gylE^RnFi=UBxVhM zDXcGhZPoZ;)wo_8_0Bg|jUVSkehajm?(`o3VXCi06d(3~7V1#vMSpZA%SwgsYtig1 zm5FB#7c_=v)P=z3(pgi7DO48$2yN2~)CZM#Fs>4xtP`KgE=0a$VQ;Pq`#LIHHsDIq z1=yEZp_PgyHX-RFbPzTWHW9WE&Lez-fSIY`3cQ)YhY7EEu66q<^3TB*pCbP|h=0lC zFS`m9b`KNrdE8I%{wv*Kw|ArJ4ha^0ut@B?+W&Z#sQZLjIM FZvm-v@;Lwi literal 0 HcmV?d00001 diff --git a/tests/__pycache__/test_chaid.cpython-38-pytest-7.1.2.pyc b/tests/__pycache__/test_chaid.cpython-38-pytest-7.1.2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a9c50c626cf7b58a096d0e7600f95e7b1467c94 GIT binary patch literal 1975 zcmdT_&2Jk;6rb5$@2;J78mAu+38B3JH9~e9l5%NKqZA1w=!N#;%QD%Sb+*|rX2xw3 z*%IPI;=(_GdWetye>lw%PW%fw@!oE1e1Hg!U1{FTd(ZRre!uzoS-0CE@cs7f&w@IH z{DIE;XM^(yn)(I|C!A(vMC;zlZHrKnJEInDGVaKOjFtI$d(^38cD9iRqks|-aEG^k zp`$K$x%VR(ZE~Ntp})a9d;|I|PQNDo;7{0vJn2)@J~|P}aak%2j{9Xf9t(Ni(?V%A zL>t9>`-AEXlX=M}nTT=b^7CMT?FgFs6^teql+!t#5p6xUr*KMcr?ftkJ)={~-5rSS zkQu~h_SBl%r`F7wI_f*^T(r11brQ;bj3?IIftWkB=CtnZIpyuw)*Uiy;a#S!l)~86 z9%TKgJEuR;@1bH2tPRbX*}8qv;lZ3@h2YNJC+Az7X{Ll8j>{|==vUgbSvD?Zs!wv$ zI{N(Zi^oqFmT}{ED`2F=y;qfHRco?6sl!r+v!=lX7f@NVMn4UGaZRnRmKXYA!FRErh#$NA}JT1v3O!yfWFAg2J93yH@vs* ze7x@LHcqqh{<`zYg2?xw7MHujXHX?IR3b@FhmVCi)nzq|mt3r=wG>vY~lKFbzdNxER3 zN_>|!btNi~4A6AZY@z9anS2H2I$G9joa92L@qeSJ+{U%vMT0Om<^vd+hyMs(jajq7 zbwJ*L6|Mua5BWD4$dxX|Ob!s#|B2&=aQ`(N_Xhj5x@0II;T1{#){%b)X}bogo)%qA z!{>8d>2d3R&-kcpIxUn5P$5>{PShyVpwSFamw^;7L2yBpRw3hE#pGMCnsFVi5;R6p$8EsNhT55>XmOW| znUNEz=9w|AO?|15ZJ6QV+c~2#lcUdqYW~!o@tknfKVlMNW+BOp|xB-KKq-ulSUHKL8o&?1$@4jUlR zQ)lFco?0`~=Ex6yqA{PE)Ot+97PV>PcUst{4t3$(q2#_6c!ywJyBCnseP@deH?vGg znAkth;+XNhu4F=r6{P)mbF@6!Md2u;J1HCB&f{ku7zf5XFvL9_A|_O)2KwGN3r%VV zgxXV5kP}Xq`a>fpxt8mo*?4RgMs84NQKRmnR+y9&R&M6jwqDq|E&eFY1B-gOJtVY= z>qC8N!kV3%Q=;D1lu&$i(W4sN)0ggr-Y;3qnGi`f{_qfM=a2w-YL;%4 z#u-oK)+lt@IND6v&))tZwAmyZY>BXyvWSn9aop#TWP4p2$w)A{5@+dfeOZ1gpXz1n zQIsYi0MmUsENzjbY%EKAkd22)%<*DBCC~KRw*#`*d-dXEcP$W0w{I^YyB>%@FAZ2j zvcA`jgkW4M*|k7~uy`%BBxg(nRMU^BIT;Enh$7?q=ogW@z2t8IN}lAE1``YZh%EE&_}0Z-M)tqK~VAsN4fn zbFcj4=`Jbf7zu`QOU-UzDAA@Uy$NSD8DPFjTSh!)GO+k8_~0ERbLbetyNcw-UZ<9l zH8^BwLbg_ePN`35>2Ma&{y;n`^}%Lo3=>&;8umC0C?cA9}%Q7-hw zR<@H;WxpS%*=CgX$I*zDjiV#-1=trCOT#7_tq!+ZpIoU=E|*HJ z3hZhnsV3JeNo8QIzUNwfa%#_wP@SYQu(nz{Ll(&$s0)4;0^|6d>wDMhlk3%_+I#bq zp|z(PzXW@qt*$(P>nm0S8z$Q;cbV9h*<@vK+#6IYIMoo!+2ig2Njxr4tQhe=5KY&K z`h90NJmTp+;sXRbFn+B0T3hppqh^j~Yrj$}GrCVqQeXdzTIVZZ?NrafFVTRZdd|Or z`C-p_2t9{3u-{RSHe-wWkM)A&6sIa&y@!B z2L2DF0UT%m+N=KM&;`1fI|>_J>4QcK`$^Tmpt~U+p1l_u@7=xfA-CY%+(v?__#WO8 zo`_|{x;_6sn?&gnKu9dN_L|k~lZ3*={{@pZQ8iPd(7DS731TfYdIQlR zAc!5OPz?Z9;a$+?&x4Gg15%oUY_iMGyK2}OamT~8EClxSAp_r3lGEm5a;R*Co zuL2Y5!T+CaI(qNqrkmQw)g8YAy1`8890Qd)NV(Xr@Nl_83iv3?oP7cv53ulLscamr zsWgQ4BpTC5sFaoNN&Ry8Yo@8Zg$pN(%Ydw-Y%G>j^{c{_A|KDF<{Z((~^R|M&KmGIrw^dM-e^4R$r=##9e&Rg{rZBam)Kz)Zy1J&QO4q2H zYS@yhr_rKytZt@m%R0T2?dIw^RdI97VCIKvJ3{ZSY$TK;#guimdCNo z3ap4@g_T$t$0=4}Q#ejD^@38Hz6mXq^EEZdoLO*Nm->C-p^$p5KR@sCm9pmwPaGua zwAqo_!8K~^_StgBZPJ~izcfzhcLqQ4a|lmaRhhb`4i!&Z)O}dN^bfV6>Z{DytuS-9 zGSpaVsQa3)FKI)=H^ho(tePzC8!eStRNvCp4AiB3ZB3O&eNAPV8`@4~Xp)t_*;a9$ z^3rIv{M4HIzWPJ-rGaPTXAE^Ovud;4no4~KYxC!Cl~}`T2-iC}-|w_$y{q2Ebyy_Y z9k=HNsb;^|YR?CmMrXdy+ulMq&|f)wWL>?bz3{@ix>9-Sz~I_kO;lfap-QJmYhqms zbkvbJdaKY7!sVW9J6;o1d3<>fdgJdmeq0q^qsJPYRnd`E>d`9e_uN`G&;~59xZ7Y( zQ(O+T=A}SywY|W8ad1s`e1ivNr`hOq9PSQyv)>&wxEo{~(P+6;6BJ&OuWk2xU*o*b zgDI!g=iP?qxK{_AMz7(aB}jGTCk>2tuhm~q2O4VyCcdQG39@*T6RLhFSvZm`91aWN zlaD0}C)O4IG>p7;=-`_$UmO%}tG#satSgqh{@`G9)FXsyS z-0Ah5@H<%KOgkYI)m95?4jQx>6qKCG51?ht3+dVne$oryPK|lt$RE|vAK{1O zk@`^cwb&zdc%y9B#8^>^(gy!=ux2- z(t}NT_G)%bSxSk2qdn(aYbyV*)Cy-k4ef-T%!a12Y)9p6bObus^oOI|296)SVF1S!R%MU*6^Y}i2**l+iVNTKi+VO!vxyPb3x{n9f7^8ouLi%?KRJw7=?TEnixa*3*avU*mn~uXl zPn4Sm#PTr+Vu&#@BpL+$JeT(xo$AI%sSa%v!bDSUrf;`{u1(lZD@Rqsjw7Y{omtU3dqc`q{sO{vB(s_+UmBeEt9w+exi6==s zMPd#j$v5Ycg@|v4PIe+RAuP;|G1gOIQ%F+h4v&?NCLcIfIG$v)C&z9&6ckwG!4Aiy zN7T($o8V1^Hx$p1;?LfR$d10Cv+)ISD|wFiJp{2@TPtZ3yjII;DX?5au_fP4DJ{u& zXVmf9#rvqW!J|{k=m`eSQ?jNmpIK6GtI?;mf;!SC;@=wo9&8xn-{gle|NaDl@gbQ7 zj<|msBS*$p{dC0riGkZ-;JKmB@{)n;5d$wwFz{F9NK%wMqZIRuvX@=WNe2EOwC5!Q z|5T1E;rLgU=OYF_nls#qfmb31KERe(h5JlSdjD9 zaR$C1Z3!9p?%R<8{#wBmp)CgA_xakw_F#T*Ef!-2u1N-tY!etb>1DFLV1+8%hkVlp z14q7z#n^@gH18W(@VDD6Nb=kL5d(jOJ$e@gF0)e+1Fzmi!{6SchNie2YiM#pz=(l| z5$L`&^h&E`_E^k#ACDNX86=10U!=2H5{DpybVM=vVX8Vxg6I%GPT~ZK zlO#@&_!5aPlX#xQX%b%{@l_I(^xEN93Y>1hS5CuHGIRUU9X= zcEvGf=KqN?D;b&1o*gW80`tOhuXnAkpX+;d<6L`wAu!%@8%u$9zCTLiV^Tay=5*TM|qeKLe;x5|V(N0hmhEBEeLS6Ziqz!vy|6C6JQS2Le(>0?B4H zJ+LK^vJsGw#*cxN0g&>mNaM#q+IfDZoLulaELXg?H*`19_c!IBa|vghz6)n0q}R0Q zY@UQf1`T|K47sEu6B_WJYOmjIQziooVAbAo*X8YIHA)+Zurj*9YjC6}5(p}83qgF5 z6qy)?W0b7t?g1ryi5?LHf_JFcB{4Cmo}L&~VJB9uq@51zbRs*Qk)3g4Xd{j6f)=pu zJkI9S5hz{`L80;=;M&bW@$th13W7yA%A_WML8D9(7}E)0RAOLktZWh64uSEn1Teyt zt&I`JmLtqvfx&j+e}?U5d)Th|Nv-xyX!Yy6YXv}G9qEO!?}zuP8Ldq3JHHaIOg9^R zmOy)PTR08O%L^FjZfbCoKs{VHMVHd8dZ&M-ZZ7v4&E}@44Gg+7P>%*iw~wV|{Swx| zb=*n#d$V?renr`xm{1Pl57gBvr)nl zTQy9|;EL@zQZ2K+kNyZccJc$ueIpK8BEK&_FU3(onP^2dR87r&nldI=^!dAJt?dY` zQF2;F@?f_U?t_>O6TTp5v)sLc-5J;e5m(VXQr5}jcaR#;UJD@&2{Q2x1Ww?PiSl43 zuAIe=if+FrX2T5)P!JS{yQ0J; zw=SgHSXs62#Q`~-jzMpNQb^{lmy+0J1~WgSew$gW@oVCDn9Z8-KgZ~giRYZa=;Vn}i!*n&LNwab&V) zX$~lJuzjFS3}~}TeL~g0F`=yWLc2gJi@B^_S_y@6qi8^{T{b54Q~E1dIs?Xr+l0|I z)&XyGaCeo*`I$#Uc|27fd&=Xv*W+%E zXqC$@vp0vB5w|q=tg2nM4>j=}6~mRX4Y|3BJ(wHTFXoQA6S8N4g$ow*r8}XA8ovN# z3(J`XHduytfxV?N*J0P#*UFB{+}td4bG6L1E0bM^>jrz9y~CEjQgna)uhIRX7Tq8J z&*%tjC+~^WzyEmSqZ8b{$GU8W=`iO>u`}{}aTac;@l&;nK2*ENceDLbkPm`_$FVr* zvakq6T&zYRU%~P*Y%!6faaKqx%CkW-p z8~wrA;6B(xpwbQkF^;1k;1ujnfW3$`c9P-FiJ^)kn4Q$Olc8_!XJHiiIsov6mSixl ziuxu*on&g@4T#Jc^Oz+O2IZ+4+{77d9MorUdpd*LzWGe~0&XFauKT7Kg=s4FH2=iw zNo~xe&Et>-k$5JxXj|%oq>#?T@pHvuA(!((1P=r}9`h(4jY9~w8HjKYV~gxOQi}$W ze873m<+%VTZxj|m{PS@dX48d(6~J(VKw6OAVUkHJ9FOCS`Mp6L7W+I7lF>NlMVLjg zG{sIhj%8yuN$El`!S;TV7D*y%D~pe|e;twSBvW1U&*U1+!A zS&RHYJ!@;Pw(y7G?F(v$fhe_G_#f1cH5IozGj0cU0>%5C;;oj_ANSwtPkW|#9X9us z6c3b+%`0!cx{`>g+MQG07G94xb{?(^>7hw(ILxgpN4TM_&GtT6L3h=_v+NI2zKf>;+ zi4E$~ra&)!gP^dEex*I8r@n$O?ab0r$xyV(8ao1Yi`ewV;EoBqb%*@XOw z#`*_7@)VYQ34#$ua}rVWw+h=LloU?XqRoyQwPDA~y`mF!jnB?|g&+BpaGyD>^*fFF z%w_F2BpNV}b>M%Cby*Mo8;pKWLjNy#g&c%bcb*;d^r)(&0>k~bnoPJj+g4mEIYJo= zJ`r5XtSX1oQ#?~tY?kveUa~HFy;i!DbTI|?^YNNFD&hjt1`_d+Lc+(OEvo% z9(?!q*WhTDN0xC$`NfK?^GaJ4mM z+7mos@mT((t?`kz(@beMH_T`!E7J-vSS-~vz98Z@Jbw9j^g?nWN0O)6$>=$kC#sr` z#!F@mkvGw)?-`26dk}8O?1CV)OK($g2ekh+6_?jiv4^P; zm|B649*Q=|*$*IYBwPv3ud%*P_A;vf%z(fy5t}HG*+zT_ntpO4RqG5bAGm?G0o))> z0~*>KYW*#?!FP>1WZNo-rgMbkhU| i;#%mY@l4NoPnu!>1% {cb.predict(model, instance)}") - - gc.collect() - - logger.info("-------------------------") - - logger.info("-------------------------") - logger.info("unit tests completed successfully...") diff --git a/tests/test_adaboost.py b/tests/test_adaboost.py new file mode 100644 index 0000000..25d928b --- /dev/null +++ b/tests/test_adaboost.py @@ -0,0 +1,27 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_adaboost.py") + + +def test_adaboost(): + config = { + "algorithm": "Regression", + "enableAdaboost": True, + "num_of_weak_classifier": 10, + "enableParallelism": False, + } + df = pd.read_csv("dataset/adaboost.txt") + validation_df = df.copy() + + model = cb.fit(df, config, validation_df=validation_df, silent=True) + + instance = [4, 3.5] + + prediction = cb.predict(model, instance) + + assert prediction == -1 + assert len(model["trees"]) > 1 + + logger.info("✅ adaboost model restoration test done") diff --git a/tests/test_c45.py b/tests/test_c45.py new file mode 100644 index 0000000..bec68ea --- /dev/null +++ b/tests/test_c45.py @@ -0,0 +1,24 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_c45.py") + + +def test_c45_for_nominal_features_and_nominal_target(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True) + assert model["config"]["algorithm"] == "C4.5" + logger.info("✅ build c4.5 for nominal and numeric features and nominal target test done") + +def test_c45_for_nominal_and_numeric_features_and_nominal_target(): + df = pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True) + assert model["config"]["algorithm"] == "C4.5" + logger.info("✅ build c4.5 for nominal and numeric features and nominal target test done") + +def test_large_dataset(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, config={"algorithm": "C4.5"}, silent=True) + assert model["config"]["algorithm"] == "C4.5" + logger.info("✅ build c4.5 for large dataset test done") \ No newline at end of file diff --git a/tests/test_cart.py b/tests/test_cart.py new file mode 100644 index 0000000..8e1c6d9 --- /dev/null +++ b/tests/test_cart.py @@ -0,0 +1,25 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_cart.py") + + +def test_cart_for_nominal_features_and_nominal_target(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, config={"algorithm": "CART"}, silent=True) + assert model["config"]["algorithm"] == "CART" + logger.info("✅ build cart for nominal and numeric features and nominal target test done") + + +def test_cart_for_nominal_and_numeric_features_and_nominal_target(): + df = pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, config={"algorithm": "CART"}, silent=True) + assert model["config"]["algorithm"] == "CART" + logger.info("✅ build cart for nominal and numeric features and nominal target test done") + +def test_large_dataset(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, config={"algorithm": "CART"}, silent=True) + assert model["config"]["algorithm"] == "CART" + logger.info("✅ build c4.5 for large dataset test done") \ No newline at end of file diff --git a/tests/test_chaid.py b/tests/test_chaid.py new file mode 100644 index 0000000..45fba69 --- /dev/null +++ b/tests/test_chaid.py @@ -0,0 +1,26 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_c45.py") + + +def test_c45_for_nominal_features_and_nominal_target(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True) + assert model["config"]["algorithm"] == "CHAID" + logger.info("✅ build chaid for nominal features and nominal target test done") + + +def test_c45_for_nominal_and_numeric_features_and_nominal_target(): + df = pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True) + assert model["config"]["algorithm"] == "CHAID" + logger.info("✅ build chaid for nominal and numeric features and nominal target test done") + + +def test_large_dataset(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, config={"algorithm": "CHAID"}, silent=True) + assert model["config"]["algorithm"] == "CHAID" + logger.info("✅ build c4.5 for large dataset test done") diff --git a/tests/test_gbm.py b/tests/test_gbm.py new file mode 100644 index 0000000..6800f4c --- /dev/null +++ b/tests/test_gbm.py @@ -0,0 +1,48 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_gbm.py") + + +def test_gbm_regression(): + config = { + "algorithm": "Regression", + "enableGBM": True, + "epochs": 10, + "learning_rate": 1, + } + + df = pd.read_csv("dataset/golf4.txt") + validation_df = pd.read_csv("dataset/golf4.txt") + + model = cb.fit(df, config, validation_df=validation_df, silent=True) + assert model["config"]["algorithm"] == "Regression" + assert len(model["trees"]) > 1 + + features = ["Sunny", 85, 85, "Weak"] + target = 25 + prediction = cb.predict(model, features) + assert abs(prediction - target) < 1 + + +def test_gbm_classification(): + config = { + "algorithm": "ID3", + "enableGBM": True, + "epochs": 10, + "learning_rate": 1, + } + + df = pd.read_csv( + "dataset/iris.data", + names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"], + ) + validation_df = df.copy() + + model = cb.fit(df, config, validation_df=validation_df, silent=True) + + instance = [7.0, 3.2, 4.7, 1.4] + target = "Iris-versicolor" + prediction = cb.predict(model, instance) + assert prediction == target diff --git a/tests/test_id3.py b/tests/test_id3.py new file mode 100644 index 0000000..d83cbf9 --- /dev/null +++ b/tests/test_id3.py @@ -0,0 +1,114 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_id3.py") + + +def test_build_id3_with_no_config(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, silent=True) + assert model["config"]["algorithm"] == "ID3" + logger.info("✅ standard id3 test done") + + +def test_build_id3_with_internal_validation_df(): + df = pd.read_csv("dataset/golf.txt") + validation_df = pd.read_csv("dataset/golf.txt") + + model = cb.fit(df, validation_df=validation_df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + validation_eval_results = model["evaluation"]["validation"] + + assert validation_eval_results.get("Accuracy", 0) > 99 + assert validation_eval_results.get("Precision", 0) > 99 + assert validation_eval_results.get("Recall", 0) > 99 + assert validation_eval_results.get("F1", 0) > 99 + assert validation_eval_results.get("Instances", 0) == validation_df.shape[0] + assert "Confusion matrix" in validation_eval_results.keys() + assert "Labels" in validation_eval_results.keys() + + # decision_rules = model["trees"][0].__dict__["__name__"]+".py" + decision_rules = model["trees"][0].__dict__["__spec__"].origin + + fi_df = cb.feature_importance(decision_rules, silent=True) + assert fi_df.shape[0] == 4 + + logger.info("✅ id3 test with internal validation data frame done") + + +def test_build_id3_with_external_validation_set(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + validation_df = pd.read_csv("dataset/golf.txt") + results = cb.evaluate(model, validation_df, silent=True) + + assert results.get("Accuracy", 0) > 99 + assert results.get("Precision", 0) > 99 + assert results.get("Recall", 0) > 99 + assert results.get("F1", 0) > 99 + assert results.get("Instances", 0) == validation_df.shape[0] + assert "Confusion matrix" in results.keys() + assert "Labels" in results.keys() + + logger.info("✅ id3 test with external validation data frame done") + + +def test_model_restoration(): + df = pd.read_csv("dataset/golf.txt") + model = cb.fit(df, silent=True) + assert model["config"]["algorithm"] == "ID3" + + cb.save_model(model) + + restored_model = cb.load_model("model.pkl") + + assert restored_model["config"]["algorithm"] == "ID3" + + instance = ["Sunny", "Hot", "High", "Weak"] + + prediction = cb.predict(restored_model, instance) + assert prediction == "No" + + logger.info("✅ id3 model restoration test done") + + +def test_build_id3_for_nominal_and_numeric_features_nominal_target(): + df = pd.read_csv("dataset/golf2.txt") + model = cb.fit(df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + instance = ["Sunny", 85, 85, "Weak"] + prediction = cb.predict(model, instance) + assert prediction == "No" + logger.info("✅ build id3 for nominal and numeric features and nominal target test done") + + +def test_large_data_set(): + df = pd.read_csv("dataset/car.data") + model = cb.fit(df, silent=True) + + assert model["config"]["algorithm"] == "ID3" + + instance = ["vhigh", "vhigh", 2, "2", "small", "low"] + prediction = cb.predict(model, instance) + assert prediction == "unacc" + + instance = ["high", "high", "4", "more", "big", "high"] + prediction = cb.predict(model, instance) + assert prediction == "acc" + + +def test_iris_dataset(): + df = pd.read_csv( + "dataset/iris.data", + names=["Sepal length", "Sepal width", "Petal length", "Petal width", "Decision"], + ) + model = cb.fit(df, silent=True) + assert model["config"]["algorithm"] == "ID3" diff --git a/tests/test_randomforest.py b/tests/test_randomforest.py new file mode 100644 index 0000000..3b244a5 --- /dev/null +++ b/tests/test_randomforest.py @@ -0,0 +1,55 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_randomforest.py") + + +def test_randomforest_for_classification(): + config = { + "algorithm": "ID3", + "enableRandomForest": True, + "num_of_trees": 3, + } + df = pd.read_csv("dataset/car.data") + + model = cb.fit(df, config, silent=True) + + assert model["config"]["algorithm"] == "ID3" + assert model["evaluation"]["train"]["Accuracy"] > 90 + + # feature importance + decision_rules = [] + for tree in model["trees"]: + decision_rule = tree.__dict__["__spec__"].origin + decision_rules.append(decision_rule) + + df = cb.feature_importance(decision_rules, silent=True) + assert df.shape[0] == 6 + + # this is not in train data + instance = ["high", "high", 4, "more", "big", "high"] + prediction = cb.predict(model, instance) + assert prediction in ["unacc", "acc"] + + instance = ["vhigh", "vhigh", 2, "2", "small", "low"] + prediction = cb.predict(model, instance) + assert prediction in ["unacc", "acc"] + + +def test_randomforest_for_regression(): + config = { + "algorithm": "ID3", + "enableRandomForest": True, + "num_of_trees": 5, + } + df = pd.read_csv("dataset/car_reg.data") + model = cb.fit(df, config, silent=True) + + assert model["evaluation"]["train"]["MAE"] < 10 + assert model["config"]["algorithm"] == "Regression" + + instance = ["high", "high", 4, "more", "big", "high"] + target = 100 + prediction = cb.predict(model, instance) + assert abs(prediction - target) < 30 diff --git a/tests/test_regression.py b/tests/test_regression.py new file mode 100644 index 0000000..35ffea4 --- /dev/null +++ b/tests/test_regression.py @@ -0,0 +1,27 @@ +import pandas as pd +from chefboost import Chefboost as cb +from chefboost.commons.logger import Logger + +logger = Logger(module="tests/test_regression.py") + + +def test_c45_for_nominal_features_and_numeric_target(): + df = pd.read_csv("dataset/golf3.txt") + _ = cb.fit(df, config={"algorithm": "Regression"}, silent=True) + logger.info("✅ build regression for nominal features and numeric target test done") + + +def test_c45_for_nominal_and_numeric_features_and_numeric_target(): + df = pd.read_csv("dataset/golf4.txt") + _ = cb.fit(df, config={"algorithm": "Regression"}, silent=True) + logger.info( + "✅ build regression tree for nominal and numeric features and numeric target test done" + ) + + +def test_switching_to_regression_tree(): + df = pd.read_csv("dataset/golf4.txt") + config = {"algorithm": "ID3"} + model = cb.fit(df, config, silent=True) + assert model["config"]["algorithm"] == "Regression" + logger.info("✅ switching to regression tree test done")