In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import json


from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
import statsmodels.api as sm
import mlflow
import mlflow.sklearn
from statistics import mean
from mlflow.models.signature import infer_signature
import shap
from catboost import CatBoostClassifier, Pool
import plotly.express as px # needed by calc_metrics2 below

Table of Contents

  • User Defined Functions
  • Background
  • Exploratory Data Analysis
  • Handling Imbalanced Data
  • Data Modeling: No Resample, No Transformations
    • Logistic Regression
    • Random Forest
    • Extreme Gradient Boost
    • CatBoost
  • Data Modeling: Resampled, Transformed
    • Logistic Regression
    • Random Forest
    • Extreme Gradient Boost
    • CatBoost
  • Data Modeling: Feature Selection
    • Feature Importance: Embedded Approach
      • Logistic Regression
      • Random Forest
      • Extreme Gradient Boost
      • CatBoost
  • Ensembling
  • Results
    • Best Metrics
    • Findings
    • Marketing List

User Defined Functions

  • Back to Top
In [267]:
class myModel:
    def __init__(self, name, model):
        self.name=name
        self.model=model
        
        
    def log_transform(self, columns:list):
        """Log transforms columns from dataframe that has been cross validated with a hold out dataset

        Args:
            columns: A list of strings representing variable names

        Returns:
            2 train & test dataframes with log transformations applied to applicable columns
        """
        # NOTE: operates on the notebook-level Xtrain/Xtest created by make_data()
        for col in columns:
            Xtrain[col]=np.log(Xtrain[col])
            Xtest[col]=np.log(Xtest[col])

        return Xtrain,Xtest



    def make_data(self, str1:str, str2:str, str3:list=None):
        """Performs hold out cross validation on dataset with options for selecting variables

        Args:
            str1: A string denoting if the function should return a full or selective dataset
            str2: A string denoting if the function should perform sampling or not
            str3: A list of strings representing variables to select in feature selection


        Returns:
            4 datasets of train test split data with applicable manipulations dependent upon the arguments given (i.e., feature selection, oversampling, none)
        """
        train = pd.read_excel('BBBC-Train.xlsx')
        test = pd.read_excel('BBBC-Test.xlsx')
        data = pd.concat([test, train])
        data.drop('Observation', axis=1, inplace=True) # keeping this column yields suspiciously high accuracy (~97%),
                                                        # which looks like leakage/overfitting
    

        if str2 == 'before':
            if str1 == 'full':
                X = data.drop(['Choice'], axis=1)
                y = data['Choice']

                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
                return Xtrain, Xtest, ytrain, ytest
            elif str1 == 'slim':
                X = data[str3]
                y = data['Choice']
                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
                return Xtrain, Xtest, ytrain, ytest
            else:
                print('Error')
        elif str2 == 'after':
            if str1 == 'full':
                X = data.drop(['Choice'], axis=1)
                y = data['Choice']
                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
              # define Pipeline and do random undersampling then SMOTE oversampling of dataset
              # [.8, .5, .2] over

                under = RandomUnderSampler(sampling_strategy=.2)#.2
                over = SMOTE(sampling_strategy=.3) # want about 1500 in minority class and 2000 majority

              # define pipeline steps
                steps = [('u', under), ('o', over)]
                pipeline = Pipeline(steps=steps)
                Xtrain, ytrain = pipeline.fit_resample(Xtrain, ytrain)
                return Xtrain, Xtest, ytrain, ytest
            elif str1 == 'slim':
                X = data[str3]
                y = data['Choice']
                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
                # define Pipeline and do random undersampling then SMOTE oversampling of dataset
                under = RandomUnderSampler(sampling_strategy=.2)
                over = SMOTE(sampling_strategy=.3) # want about 1500 in minority class and 2000 majority

                # define pipeline steps
                steps = [('u', under), ('o', over)]
                pipeline = Pipeline(steps=steps)
                Xtrain, ytrain = pipeline.fit_resample(Xtrain, ytrain)
                return Xtrain, Xtest, ytrain, ytest
        else:
            print('Error')


    def fit_model(self, model, scoring:list, tuning=False, params=None):
        """Fits model and scores with cross validations based on selected metrics
        
        Args:
            model: A chosen Ml model for training
            scoring: A list of metrics to use for scoring over default 5 folds
            tuning: A boolean representing whether hyper parameter tuning should be performed
            params: A dictionary of parameters to be tuned for the given model
            
        Returns:
            1. fit model
            2. A dict holding average metrics over each CV fold
        
        
        """
        if not tuning:

            fit_model = model.fit(Xtrain,ytrain)

            scores = cross_validate(model, Xtest, ytest, scoring=scoring)
            
            # print(f"Average Test Accuracy Score: {scores['test_accuracy'].mean()}")
            
            return fit_model, scores
            
        elif tuning == True:
            
            model = GridSearchCV(model, params, cv=5)

            model.fit(Xtrain,ytrain)

            fit_model = model.best_estimator_

            scores = cross_validate(model, Xtest, ytest, scoring=scoring)
            # print(f"Average Test Accuracy Score: {scores['test_accuracy'].mean()}")
            
            return fit_model, scores
        


 
    
    
    
def get_profit(df):
    
    
   
    # profit = revenue - cost

    #cost per book
    mailing_cost = 0.65
    manuf_cost = 15
    overhead_cost = manuf_cost * .45

    #revenue
    selling_price = 31.95
    
    all_cust = 50000
    # profit per book = 31.95 - 0.65 - 15 - 6.75 = 9.55
    prof_book = (selling_price - mailing_cost - manuf_cost - overhead_cost)

    n_cust = 103 # customers who purchased in test set
    total_cust = 780 # total customers in test set

    pct_buying = .132
    
    
    cost_nomodel = mailing_cost*all_cust
    resp_nomodel = pct_buying*all_cust

    # create empty columns
    df['Profit'] = 0
    df['No Model Profit'] = 0
    df['Profit Boost From Model'] = 0

    for i in df.index:
        not_sold_by_model = df['false_negatives'][i] # false negatives
        sold_by_model = df['true_positives'][i] # true positives 
        missed_opportunities = df['false_positives'][i] # false positives
        total_mailed = not_sold_by_model + sold_by_model
        no_model_total = n_cust
        total_cust = 780

        # subtracting mailing expenses and calculating profit for total costs predicted by model

        # penalizes for false positives
        profit = (((not_sold_by_model + sold_by_model) * prof_book)- missed_opportunities) - (total_mailed * mailing_cost)
        
      
         # uncomment for original
        no_model_profit = ((no_model_total * prof_book)-no_model_total) - (total_cust * mailing_cost)


        df.loc[i, 'Profit'] = profit
        df.loc[i, 'No Model Profit'] = no_model_profit
        df.loc[i, 'Profit Boost From Model'] = ((profit - no_model_profit)/no_model_profit)*100
        


    return df






def calc_metrics2(models):

    models = models[['Model','accuracy_score', 'precision_score','f1_score', 'recall_score']]

    # reshape to long format so each metric becomes its own row for the grouped bar chart
    models = pd.melt(models, id_vars=['Model'], value_vars=['accuracy_score', 'precision_score', 'f1_score', 'recall_score'])

    fig = px.bar(models, x="Model", y="value",
                 color="variable", barmode="group")

    fig.show()
    

def json_to_df(json_file, name):
      
    # store json file to dictionary
    with open(json_file) as file:
        data = json.load(file)
    data = pd.DataFrame(data, index=[x for x in range(1, len(data))])
    data['Model'] = name
    
    return data
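
# NOTE: later cells call m1.make_preds, m1.make_roc, m1.make_report and m1.make_cm,
# which are not defined in the class above. The functions below are a minimal sketch of
# what such helpers could look like, assuming they evaluate the fitted model against the
# notebook-level Xtest/ytest the way fit_model does. The signatures mirror how the methods
# are called later; everything else is an assumption, not the author's code.

def make_preds_sketch(fitted_model):
    # predict on the hold-out set
    return fitted_model.predict(Xtest)

def make_roc_sketch(preds):
    # ROC curve points for the hold-out predictions
    fpr, tpr, thresholds = metrics.roc_curve(ytest, preds)
    return pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': thresholds})

def make_report_sketch(preds):
    # per-class precision / recall / f1 on the hold-out set
    rpt = classification_report(ytest, preds)
    print(rpt)
    return rpt

def make_cm_sketch(preds):
    # confusion matrix on the hold-out set
    cm = confusion_matrix(ytest, preds)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    return cm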
           

Background

  • Back to Top

As of 1994, about 50,000 new titles were published in the US each year, giving rise to a $20B book publishing industry. About 10% of these books were sold through mail order. Recently, online superstores such as Amazon have emerged, carrying 1-2.5M titles and further intensifying the pressure on book clubs and mail order firms. In response to these pressures, book clubs are starting to look at alternative business models that will make them more responsive to their customers' preferences.

The Bookbinders Book Club (BBBC) was established in 1986 for the purpose of selling specialty books through direct marketing. BBBC is strictly a distributor and does not publish any of the books it sells. In anticipation of using database marketing, BBBC made a strategic decision right from the start to build and maintain a detailed database about its members containing all relevant information about them. Readers fill out an insert and return it to BBBC, which then enters the data into the database.

BBBC is exploring whether to use predictive modeling approaches to improve the efficacy of its direct mail program. For this analysis, we will use a subset of the database available to BBBC. The dependent variable (i.e., response group) for the analysis is Choice – purchase or no purchase of the book. BBBC also selected several independent variables that it thought might explain the observed choice behavior. The variables are:

Name Type Description
Choice int Whether the customer purchased The Art History of Florence. 1 = purchase and 0 = non-purchase
Gender int 0 = Female and 1 = Male
Amount_purchased int Total money spent on BBBC books
Frequency int Total number of purchases in the chosen period
Last_purchase int Months since last purchase
First_purchase int Months since first purchase
P_Child int Number of children’s books purchased
P_Youth int Number of youth’s books purchased
P_Cook int Number of cookbooks purchased
P_DIY int Number of do-it-yourself books purchased
P_Art int Number of art books purchased

Exploratory Data Analysis

  • Back to Top

Problem Type:
This is a supervised classification problem on structured data, since we are predicting a binary response: 0 or 1 for purchase or no purchase, respectively.

Business Goals/Requirements:

  • Goal:
    • Develop the most accurate model for predicting which customers will purchase the marketed book
    • Achieve above 50% accuracy (better than random chance)
    • Maximize profit
  • Requirements:
    • Optimize for Precision. Of all customers the model predicts will purchase, we want as many as possible to actually purchase, rather than optimizing for Recall (capturing every customer who actually purchases). Precision on the purchase class will act as our north star during this modeling process, alongside other metrics like recall, f1, and test accuracy (a tiny worked example follows this list)
    • Minimize False Positives. It would be a poor use of resources for the company to mail a customer it predicts will purchase when they do not.
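
As a quick worked example of the precision vs. recall distinction (toy labels purely for illustration, not the BBBC data):

from sklearn.metrics import precision_score, recall_score, confusion_matrix

# 1 = purchase, 0 = no purchase (made-up labels)
y_true = [1, 0, 0, 1, 0, 0, 1, 0]
y_pred = [1, 1, 0, 0, 0, 0, 1, 0]

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("precision = TP/(TP+FP) =", tp, "/", tp + fp, "=", precision_score(y_true, y_pred))  # 2/3
print("recall    = TP/(TP+FN) =", tp, "/", tp + fn, "=", recall_score(y_true, y_pred))     # 2/3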

To avoid data leakage I perform hold-out cross validation before exploring the data:

In [38]:
model = LogisticRegression(random_state=7, max_iter=1000)
m1 = myModel('EDA',model)

# make data
x_train, x_test, y_train, y_test = m1.make_data("full", "before", _)
In [ ]:
 
In [39]:
x_train.head()
Out[39]:
Gender Amount_purchased Frequency Last_purchase First_purchase P_Child P_Youth P_Cook P_DIY P_Art
576 1 210 12 1 12 0 0 0 1 0
1522 0 34 4 1 4 0 0 1 0 0
465 1 303 2 2 6 0 0 1 0 0
488 1 266 34 2 40 0 0 0 0 1
451 0 340 14 4 26 3 0 1 0 0
In [40]:
# how many observations will we be training with?
x_train.shape
Out[40]:
(3120, 10)
In [41]:
# is response variable balanced?
fig, axs = plt.subplots(2)
fig.suptitle('Observing Imbalanced Data')
axs[0].hist(y_train)
axs[1].hist(y_test)
Out[41]:
(array([677.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 103.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 <BarContainer object of 10 artists>)

The response variable is highly imbalanced. Leaving the data in its current state could lead to a model that is a poor predictor for the minority class (1), which is our focus in this modeling process. One of our goals is to best predict customers who will purchase, and a model that predicts otherwise will be ineffective. There are many techniques to overcome this; I will attempt 2 and observe the results:

  1. Oversampling with SMOTE: This technique is more robust than ordinary oversampling. Traditionally we would just clone the same data points until the variable is more optimally balanced. This leads to poor performance as no new information is added to the model and one observation will outweigh all others. Synthetic Minority Oversampling Technique seeks to correct this by synthesizing new examples from combinations of existing examples.
  2. Undersampling: I will use traditional undersampling to remove observations from the majority class. Research has shown this combination outperforms oversampling alone (SMOTE: Synthetic Minority Over-sampling Technique, 2011). A tiny sketch of the combined pipeline follows this list.
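
A minimal sketch of the undersample-then-SMOTE pipeline used later in make_data, run on synthetic data just to show how the class counts shift (the 0.2/0.3 ratios mirror the ones used below; the toy data itself is made up):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# toy data with roughly the same ~13% minority rate as the BBBC response
X_toy, y_toy = make_classification(n_samples=3000, weights=[0.87], random_state=7)
print("before:", Counter(y_toy))

pipeline = Pipeline([('u', RandomUnderSampler(sampling_strategy=0.2)),   # shrink the majority class
                     ('o', SMOTE(sampling_strategy=0.3))])               # then synthesize minority examples
X_res, y_res = pipeline.fit_resample(X_toy, y_toy)
print("after: ", Counter(y_res))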

Modeling Approaches: This is a supervised classification problem so a few modeling approaches come to mind:

  • Logistic Regression: it is interpretable and a good baseline model. (Maybe a Naive Bayes model, but I am unsure about all variables being independent of each other)
  • Random Forest (RF): A strong step up from the logit model, where we can also gain feature importance from this embedded feature selection approach. We will also tune and observe results
  • Support Vector Classifier (SVC): There are fewer than 10,000 observations, so this is a viable and powerful approach to this type of problem due to its kernel trick. We will tune this for optimal results, as otherwise we could observe poor performance
  • XGB: tuning can be time consuming, so we will need to select our parameters wisely; it also works well on small datasets compared to Random Forest models, which generally do not perform well on small data
  • Ensembling: Will take the best performing models (per our requirements!) and tie together those learnings for what is hopefully an optimal model (a minimal VotingClassifier sketch follows this list)
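
A minimal sketch of the ensembling idea with sklearn's VotingClassifier; the base estimators here are placeholders, not the final ensemble:

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# soft voting averages the predicted probabilities across the base models
ensemble = VotingClassifier(
    estimators=[('logit', LogisticRegression(max_iter=1000, random_state=7)),
                ('rf', RandomForestClassifier(random_state=7))],
    voting='soft')
# ensemble.fit(Xtrain, ytrain); ensemble.predict(Xtest)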

Let's start exploring the basics of the data...

In [42]:
# what kind of data types in train and test
print("Training:")
print(x_train.info())
print()
print("Testing:")
print(x_test.info())
Training:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3120 entries, 576 to 175
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Gender            3120 non-null   int64
 1   Amount_purchased  3120 non-null   int64
 2   Frequency         3120 non-null   int64
 3   Last_purchase     3120 non-null   int64
 4   First_purchase    3120 non-null   int64
 5   P_Child           3120 non-null   int64
 6   P_Youth           3120 non-null   int64
 7   P_Cook            3120 non-null   int64
 8   P_DIY             3120 non-null   int64
 9   P_Art             3120 non-null   int64
dtypes: int64(10)
memory usage: 268.1 KB
None

Testing:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 780 entries, 284 to 1277
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Gender            780 non-null    int64
 1   Amount_purchased  780 non-null    int64
 2   Frequency         780 non-null    int64
 3   Last_purchase     780 non-null    int64
 4   First_purchase    780 non-null    int64
 5   P_Child           780 non-null    int64
 6   P_Youth           780 non-null    int64
 7   P_Cook            780 non-null    int64
 8   P_DIY             780 non-null    int64
 9   P_Art             780 non-null    int64
dtypes: int64(10)
memory usage: 67.0 KB
None

We observe a few things here:

  • all data is of type integer
  • there are no null values in the train or test set

Let's use describe() to see some statistical metrics:

In [43]:
x_train.describe()
Out[43]:
Gender Amount_purchased Frequency Last_purchase First_purchase P_Child P_Youth P_Cook P_DIY P_Art
count 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000
mean 0.670192 196.717949 12.815385 3.106410 22.627564 0.725000 0.347115 0.771795 0.394231 0.372436
std 0.470219 94.960142 8.052841 2.942795 15.906823 1.018906 0.644874 1.032576 0.692815 0.675507
min 0.000000 15.000000 2.000000 1.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 122.000000 6.000000 1.000000 12.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 199.000000 12.000000 2.000000 18.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.000000 268.000000 16.000000 4.000000 30.000000 1.000000 1.000000 1.000000 1.000000 1.000000
max 1.000000 474.000000 36.000000 12.000000 96.000000 8.000000 5.000000 6.000000 4.000000 5.000000

There doesn't appear to be anything awkward in the data, meaning there are no negative minimums for Amount_purchased. If I were modeling in R I might convert Gender to a factor, but I will leave the variable as is in Python since most libraries/algorithms can handle binary variables directly without having to convert.

In [44]:
fig, axs = plt.subplots(2,5, figsize = (20,10))
fig.suptitle('Observing Normality in Data')
axs[0][0].hist(x_train['Gender'])
axs[0][1].hist(x_train['Amount_purchased'])
axs[0][2].hist(x_train['Frequency'])
axs[0][3].hist(x_train['Last_purchase'])
axs[0][4].hist(x_train['First_purchase'])
axs[1][0].hist(x_train['P_Child'])
axs[1][1].hist(x_train['P_Youth'])
axs[1][2].hist(x_train['P_Cook'])
axs[1][3].hist(x_train['P_DIY'])
axs[1][4].hist(x_train['P_Art'])

#define subplot titles
axs[0, 0].set_title('Gender')
axs[0, 1].set_title('Amount_purchased')
axs[0, 2].set_title('Frequency')
axs[0, 3].set_title('Last_purchase')
axs[0, 4].set_title('First_purchase')
axs[1, 0].set_title('P_Child')
axs[1, 1].set_title('P_Youth')
axs[1, 2].set_title('P_Cook')
axs[1, 3].set_title('P_DIY')
axs[1, 4].set_title('P_Art')
Out[44]:
Text(0.5, 1.0, 'P_Art')

Disregarding Gender, all variables appear slightly right skewed. We could do a log transformation and see if that recenters (normalizes) the data. However, the log transformation does not work for the count variables that range from 0 to 6 (e.g., P_Cook) because the logarithm is undefined for values less than or equal to zero, and we have many values at zero for multiple variables. We will not log transform these variables, as dropping the zero-valued rows would mean removing the majority of the data.
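
If we did want a log-style transform that tolerates zeros, np.log1p (i.e., log(1 + x)) is one option; a minimal sketch for illustration only, not applied in this analysis:

# np.log is undefined at 0, but log1p(x) = log(1 + x) handles zero counts
counts = np.array([0, 1, 2, 6])
print(np.log1p(counts))   # [0.         0.69314718 1.09861229 1.94591015]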

In [45]:
# log transform data
log_xtrain = np.log(x_train)


fig, axs = plt.subplots(1,4, figsize = (20,10))
fig.suptitle('Observing Normality in Data')
#axs[0][0].hist(log_xtrain['Gender'])
axs[0].hist(log_xtrain['Amount_purchased'])
axs[1].hist(log_xtrain['Frequency'])
axs[2].hist(log_xtrain['Last_purchase'])
axs[3].hist(log_xtrain['First_purchase'])
#axs[1][0].hist(log_xtrain['P_Child'])
# axs[1][1].hist(log_xtrain['P_Youth'])
# axs[1][2].hist(log_xtrain['P_Cook'])
# axs[1][3].hist(log_xtrain['P_DIY'])
# axs[1][4].hist(log_xtrain['P_Art'])

#define subplot titles
#axs[0, 0].set_title('Gender')
axs[0].set_title('Amount_purchased')
axs[1].set_title('Frequency')
axs[2].set_title('Last_purchase')
axs[3].set_title('First_purchase')
# axs[1, 0].set_title('P_Child')
# axs[1, 1].set_title('P_Youth')
# axs[1, 2].set_title('P_Cook')
# axs[1, 3].set_title('P_DIY')
# axs[1, 4].set_title('P_Art')
divide by zero encountered in log
Out[45]:
Text(0.5, 1.0, 'First_purchase')

No Transformation Will Be Applied:

  • Amount_purchased becomes less normal after the transformation, so we will not transform this variable.
  • Last_purchase is dominated by the high volume of customers sitting near 0; the transformation did not help, so we will not transform this variable.

Transformation Applied:

  • Frequency and First_purchase did benefit from the transformation and become more normally distributed.

Next I will look for relationships among the independent variables and between each of them and the response. We will observe a pairplot, but since we want to include the response we will need to read in the data again.

In [46]:
train0 = pd.read_excel('BBBC-Train.xlsx')
test0 = pd.read_excel('BBBC-Test.xlsx')
data0 = pd.concat([test0, train0])
#data0.drop('Observation', axis=1, inplace=True)
plt.figure(figsize=(1,5)) # note: sns.pairplot creates its own figure, so this call just produces the empty Figure shown below
sns.pairplot(data0)
plt.show()
<Figure size 100x500 with 0 Axes>

The clearest relationship is between First_purchase and Amount_purchased: the more months since first purchase, the more the customer is likely to have spent. Let's plot a heatmap of the same relationships.

In [47]:
plt.figure(figsize=(12, 7))
sns.heatmap(data0.corr(), annot = True, vmin=0, vmax=1)
plt.show()

Multicollinearity is when two or more independent variables in a regression model are highly correlated with each other. This can be a problem because it makes it difficult to identify the unique effect of each variable on the dependent variable. Additionally, because the independent variables are correlated, the coefficients in the regression equation can change a lot when a data point is added or removed, making the model unstable and unreliable. In this study any correlation over 75% is considered highly correlated.
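
Pairwise correlation only catches two-variable relationships; variance inflation factors give a per-variable view of multicollinearity. A minimal sketch using statsmodels, assuming the data0 frame from two cells earlier (which still contains the Observation column):

from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = data0.drop(['Choice', 'Observation'], axis=1)
vif = pd.DataFrame({'variable': X_vif.columns,
                    'VIF': [variance_inflation_factor(X_vif.values, i)
                            for i in range(X_vif.shape[1])]})
print(vif.sort_values('VIF', ascending=False))   # VIF above ~5-10 usually flags problematic collinearity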

Things to consider from heatmap:

  • With our response variable Choice there is a small relationship with P_Art, which indicates whether the customer purchased art books in the past. This makes sense, as the marketing material is for an art book.
  • Strong relationship between First_purchase and Last_purchase, also remembering that First_purchase was more normally distributed
  • Somewhat strong relationship between Last_purchase and P_Child, as well as Last_purchase and P_Cook. This model may benefit from removing the Last_purchase variable since it is so correlated with multiple variables
  • I'm confused by the high correlation between the response variable and Observation. This appears to be an id column that denotes individual customers. How does that indicate purchase?

Let's rethink what the variables represent to identify applications for feature engineering:

Name Type Description
Choice int Whether the customer purchased The Art History of Florence. 1 = purchase and 0 = non-purchase
Gender int 0 = Female and 1 = Male
Amount_purchased int Total money spent on BBBC books
Frequency int Total number of purchases in the chosen period
Last_purchase int Months since last purchase
First_purchase int Months since first purchase
P_Child int Number of children’s books purchased
P_Youth int Number of youth’s books purchased
P_Cook int Number of cookbooks purchased
P_DIY int Number of do-it-yourself books purchased
P_Art int Number of art books purchased
  • In the real world I would meet with stakeholders to clarify the Frequency variable and define what it means to be "within the chosen period". Is that within the sampling period, or within a given month?
  • Without greater context any feature engineering is only speculation and unreliable.
    • For example, if we knew what specifically was purchased during that Frequency, that would be more telling.
    • Having a timestamp variable would make things clearer as well. We could then hone in on which months see more purchases and begin building a forecasting model. We do not have that information, so we will keep the variables as they stand.

Handling Imbalanced Data via Resampling

  • Back to Top

    NOTE: I originally modeled without the Observation variable and obtained much lower metrics, like the logistic regression shown below.

TODO: Explain metrics: precision is low across the range of recall due to the class imbalance, so the PR curve takes on an L shape. The ROC curve is not robust to class imbalance and swings wildly, so it is not completely reliable here.
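
The PR-curve behaviour described above can be reproduced directly with the display helper imported at the top; a minimal sketch, assuming a fitted logit_model and the hold-out Xtest/ytest from the cells below:

# precision-recall curve for the purchase class; on heavily imbalanced data it tends toward an L shape
PrecisionRecallDisplay.from_estimator(logit_model, Xtest, ytest, pos_label=1)
plt.show()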

After tracking and comparing runs in MLflow we see there are differences with resampling: the resampled model is much better at capturing true positives, but it also produces many more false positives.

[Figure: MLflow run comparison]

Logit Model Before Sampling

[Figure: confusion matrix, logit model before resampling]

Logit Model After Sampling

[Figure: confusion matrix, logit model after resampling]

As shown in the confusion matrices, the model has gotten better at predicting our minority class after resampling.
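
For reference, either confusion matrix above can be regenerated with the display helper imported at the top; a minimal sketch, again assuming a fitted logit_model and the hold-out split from the next cell:

preds = logit_model.predict(Xtest)
ConfusionMatrixDisplay.from_predictions(ytest, preds)
plt.show()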

In [48]:
# make model
model = LogisticRegression(random_state=7, max_iter=1000)

m1 = myModel('Logistic Regression',model)

# make data
Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
print(ytrain.hist())

# make model
logit_model, logit_scores = m1.fit_model(model, ['f1','accuracy'], tuning=False)

# make predictions
logit_preds = m1.make_preds(logit_model)

# make roc plots
logit_df = m1.make_roc(logit_preds)

# make classification report
logit_rpt = m1.make_report(logit_preds, logit_df)

# make confusion matrix
logit_cm = m1.make_cm(logit_preds, logit_model, logit_rpt)
Axes(0.125,0.11;0.775x0.77)
Average Test Accuracy Score: 0.8833333333333332
In [49]:
# make model
model = LogisticRegression(random_state=7, max_iter=1000)

m1 = myModel('Logistic Regression',model)

# make data
Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "after", _)
print(ytrain.hist())

# make model
logit_model, logit_scores = m1.fit_model(model, ['f1','accuracy'], tuning=False)

# make predictions
logit_preds = m1.make_preds(logit_model)

# make roc plots
logit_df = m1.make_roc(logit_preds)

# make classification report
logit_rpt = m1.make_report(logit_preds, logit_df)

# make confusion matrix
logit_cm = m1.make_cm(logit_preds, logit_model, logit_rpt)
Axes(0.125,0.11;0.775x0.77)
Average Test Accuracy Score: 0.8833333333333332

Data Modeling: No Resample, No Transformations

  • Back to Top

All of my modeling was tracked using MLflow, an open-source tool to easily track model runs, parameters, and metrics as you iterate through the modeling process. The most optimal models were selected based on a combination of the metrics below (a short search_runs sketch follows the list):

  • f1 on test data
  • precision on test minority class
  • accuracy on test data
  • recall on test data
  • number of True Positives and False Positives
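
Once the runs below have been logged, MLflow can rank them by any of these metrics; a minimal sketch, assuming the BBBC_modeling experiment name used throughout (column names follow MLflow's metrics.<name> / tags.mlflow.runName convention):

runs = mlflow.search_runs(experiment_names=["BBBC_modeling"],
                          order_by=["metrics.precision DESC"])
print(runs[["tags.mlflow.runName", "metrics.precision", "metrics.f1", "metrics.accuracy"]].head())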

Logistic Regression

  • Back to Top
In [145]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='F_b_logit'):
    
    model = LogisticRegression(random_state=7, max_iter=1000)

    m1 = myModel('Logistic Regression',model)
    
   
    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
 


    # make model
    logit_model,logit_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)

    
    #signature?
    signature = infer_signature(Xtest, logit_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest            # note: no copy is taken, so the next line also adds a 'label' column to Xtest itself
    eval_data["label"] = ytest
    
    undersamp = 0   # this run uses the un-resampled ("before") data, so no sampling ratios apply
    oversamp = 0
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(logit_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(logit_scores['test_f1']))
    mlflow.log_metric('precision', mean(logit_scores['test_precision']))
    mlflow.log_metric('recall', mean(logit_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    

    # log model
    mlflow.sklearn.log_model(logit_model, "logit model")
    model_uri = mlflow.get_artifact_uri("logit model")
    print("Model Loaded")
    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8833333333333332
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:11:02 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:11:02 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:11:02 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:11:02 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:11:02 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Random Forest

  • Back to Top
In [147]:
# no significant improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_rf_final'):
    model = RandomForestClassifier(random_state=7)

    m1 = myModel('Tuned Random Forest',model)

    # make data
    
    '''
    
    Random forest is an ideal algorithm to deal with the extreme imbalance owing to two main reasons. 
    Firstly, the ability to incorporate class weights into the random forest classifier makes it cost-sensitive; 
    hence it penalizes misclassifying the minority class. 
    
    '''
    
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
 
       
    # make model
    params = {'n_estimators': [25,50,100,200,500],
                 'max_depth': [2,5,8, 50]}
    rf_model, rf_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    
    #signature?
    signature = infer_signature(Xtest, rf_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = 0
    oversamp = 0
 
    
    rf_params = rf_model.get_params()
    
    # # log parameters
    mlflow.log_param('n_estimators', rf_params['n_estimators'])
    mlflow.log_param('max_depth', rf_params['max_depth'])
   
       
    
    # log metrics
    mlflow.log_metric('accuracy', mean(rf_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(rf_scores['test_f1']))
    mlflow.log_metric('precision', mean(rf_scores['test_precision']))
    mlflow.log_metric('recall', mean(rf_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model
    mlflow.sklearn.log_model(rf_model, "log rf model")
    model_uri = mlflow.get_artifact_uri("log rf model")
    print("Model Loaded")
    
    
    explainer = shap.Explainer(rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values
    
    #evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config= {"log_model_explainability":False} # error called when creating beeswarm plot,
    )
    
     # save results in json
    result.save("")
    print("Metrics Saved")




    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8705128205128204
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

Model Loaded
 98%|===================| 1534/1560 [00:21<00:00]        2023/06/13 16:15:27 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:15:27 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:15:27 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Metrics Saved
<Figure size 1050x700 with 0 Axes>

eXtreme Gradient Boost (XGB)

  • Back to Top
In [148]:
# no improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                             eta=.2,
                             max_depth=3,
                             gamma=1,
                             subsample=1)

    m1 = myModel('Tuned XGB Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
 
     

    # make model
    # note: the grid below is defined but tuning=False, so fit_model ignores it and uses the
    # hyperparameters set on the estimator above (eta=.2, max_depth=3, gamma=1, subsample=1)
    params = {'eta' : [0.01, 0.2],
          'max_depth': [3],
          'gamma': [0 , .1],
          'subsample': [0.5, 1]}
    
    tuned_xgb_model, tuned_xgb_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False, params=params)

    
    #signature?
    signature = infer_signature(Xtest, tuned_xgb_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
    xgb_params = tuned_xgb_model.get_params()
    
    # log parameters
    mlflow.log_param('eta', xgb_params['eta'])
    mlflow.log_param('max_depth', xgb_params['max_depth'])
    mlflow.log_param('gamma', xgb_params['gamma'])
    mlflow.log_param('subsample', xgb_params['subsample'])
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(tuned_xgb_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(tuned_xgb_scores['test_f1']))
    mlflow.log_metric('precision', mean(tuned_xgb_scores['test_precision']))
    mlflow.log_metric('recall', mean(tuned_xgb_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(tuned_xgb_model, "xgb model")
    model_uri = mlflow.get_artifact_uri("xgb model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")
    


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:16:02 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:16:02 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:16:02 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:16:02 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:16:03 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved
In [54]:
# catboost_model.get_all_params()
In [55]:
# catboost_model.get_params()

CatBoost

  • Back to Top
In [149]:
# no significant difference after log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_catboost_final'):
    
    
    params = {'iterations':[2,4, 8, 20, 50],
            'depth': [6,7,8,9]}


    model = CatBoostClassifier(bagging_temperature=0,
                               learning_rate=1,
                           loss_function='Logloss',
                           verbose=False,
                           random_state=7)

    m1 = myModel('CatBoost Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    catboost_model, catboost_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    catboost_params = catboost_model.get_params()

    
    #signature?
    signature = infer_signature(Xtest, catboost_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
     # log parameters
    mlflow.log_param('iterations', catboost_params['iterations'])
    mlflow.log_param('learning_rate', catboost_params['learning_rate'])
    mlflow.log_param('depth', catboost_params['depth'])
    mlflow.log_param('bagging_temperature', catboost_params['bagging_temperature'])
    mlflow.log_param('loss_function', catboost_params['loss_function'])
    
    # log metrics
    mlflow.log_metric('accuracy', mean(catboost_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(catboost_scores['test_f1']))
    mlflow.log_metric('precision', mean(catboost_scores['test_precision']))
    mlflow.log_metric('recall', mean(catboost_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
     
      

    # log model
    mlflow.sklearn.log_model(catboost_model, "catboost model")
    model_uri = mlflow.get_artifact_uri("catboost model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")
    


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:17:13 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:17:13 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:17:13 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/06/13 16:17:13 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 16:17:14 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:17:14 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved

Data Modeling: Resampled, Transformed

Logistic Regression

  • Back to Top
In [150]:
# significant improvement after log transformation and resampling

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='a_log_logit_final'):
    
    model = LogisticRegression(random_state=7, max_iter=1000)

    m1 = myModel('Log Logistic Regression',model)
    
   
    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "after", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model
    log_logit_model,log_logit_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    
    #signature?
    signature = infer_signature(Xtest, log_logit_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = .2
    oversamp = .3

   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_logit_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_logit_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_logit_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_logit_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    

    # log model
    mlflow.sklearn.log_model(log_logit_model, "log_logit model after002003")
    model_uri = mlflow.get_artifact_uri("log_logit model after002003")
    print("Model Loaded")
    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8923076923076924
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:21:46 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:21:46 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:21:46 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:21:47 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:21:47 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved
In [ ]:
 

Random Forest

  • Back to Top
In [58]:
# no significant improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_log_rf_final'):
    model = RandomForestClassifier(random_state=7)

    m1 = myModel('Log Random Forest',model)

    # make data
    
    '''
    
    Random forest is an ideal algorithm to deal with the extreme imbalance owing to two main reasons. 
    Firstly, the ability to incorporate class weights into the random forest classifier makes it cost-sensitive; 
    hence it penalizes misclassifying the minority class. 
    
    '''
    
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

       
    # make model
    params = {'n_estimators': [25,50,100,200,500],
                 'max_depth': [2,5,8, 50]}
    log_rf_model, log_rf_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
 
    #signature?
    signature = infer_signature(Xtest, log_rf_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = 0
    oversamp = 0
 
    
    log_rf_params = log_rf_model.get_params()
    
    # # log parameters
    mlflow.log_param('n_estimators', log_rf_params['n_estimators'])
    mlflow.log_param('max_depth', log_rf_params['max_depth'])
   
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_rf_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_rf_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_rf_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_rf_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model
    mlflow.sklearn.log_model(log_rf_model, "log rf model")
    model_uri = mlflow.get_artifact_uri("log rf model")
    print("Model Loaded")
    
    
    explainer = shap.Explainer(log_rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values
    
    #evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config= {"log_model_explainability":False} # error called when creating beeswarm plot,
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")

    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8743589743589745
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
Model Loaded
2023/06/13 14:37:09 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:37:09 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:37:09 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
<Figure size 1050x700 with 0 Axes>

eXtreme Gradient Boost

  • Back to Top
In [152]:
# no improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_log_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                             eta=.2,
                             max_depth=3,
                             gamma=1,
                             subsample=1)

    m1 = myModel('Log XGB Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])
  

    # make model
    # note: the grid below is defined but tuning=False, so fit_model ignores it and uses the
    # hyperparameters set on the estimator above
    params = {'eta' : [0.2],
          'max_depth': [3],
          'gamma': [0 , .1],
          'subsample': [1]}
    log_tuned_xgb_model, log_tuned_xgb_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False, params=params)
    
    #signature?
    signature = infer_signature(Xtest, log_tuned_xgb_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
    log_xgb_params = log_tuned_xgb_model.get_params()
    
    # log parameters
    mlflow.log_param('eta', log_xgb_params['eta'])
    mlflow.log_param('max_depth', log_xgb_params['max_depth'])
    mlflow.log_param('gamma', log_xgb_params['gamma'])
    mlflow.log_param('subsample', log_xgb_params['subsample'])
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_tuned_xgb_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_tuned_xgb_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_tuned_xgb_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_tuned_xgb_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(log_tuned_xgb_model, "xgb model")
    model_uri = mlflow.get_artifact_uri("xgb model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:27:44 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:27:44 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:27:44 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:27:44 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:27:45 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved

CatBoost

  • Back to Top
In [60]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_log_catboost_final'):
    
    
    params = {'iterations':[2,4],
            'depth': [6,7,8,9],
             'bagging_temperature': [0, 10, 50]}


    model = CatBoostClassifier(learning_rate=1,
                           loss_function='Logloss',
                           verbose=False,
                           random_state=7)

    m1 = myModel('Log CatBoost Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])
    
    log_catboost_model, log_catboost_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    log_catboost_params = log_catboost_model.get_params()

    
    #signature?
    signature = infer_signature(Xtest, log_catboost_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
     # log parameters
    mlflow.log_param('iterations', log_catboost_params['iterations'])
    mlflow.log_param('learning_rate', log_catboost_params['learning_rate'])
    mlflow.log_param('depth', log_catboost_params['depth'])
    mlflow.log_param('bagging_temperature', log_catboost_params['bagging_temperature'])
    mlflow.log_param('loss_function', log_catboost_params['loss_function'])
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_catboost_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_catboost_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_catboost_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_catboost_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
     
      

    # log model
    mlflow.sklearn.log_model(log_catboost_model, "log_catboost model")
    model_uri = mlflow.get_artifact_uri("log_catboost model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Average Test Accuracy Score: 0.8666666666666666
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:37:19 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:37:19 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:37:19 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/06/13 14:37:19 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 14:37:20 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/13 14:37:20 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Data Modeling: Feature Selection

Only the Logistic Regression model improved significantly after log transformations, so this transformation will not be applied to the other models going forward in the analysis.

Feature Importance: Embedded RF Approach

  • Back to Top
In [61]:
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
feature_names = [i for i in rf_model.feature_names_in_]
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

The following variables will be considered important (a quick selection sketch follows this list):

  • P_Art
  • Amount_purchased
  • Frequency
  • First_purchase
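
As a rough illustration (not the mechanism make_data uses internally), the same slim feature set could be pulled straight from the importance series computed above; the 0.05 cutoff is an assumed threshold, not a tuned value.

# Hypothetical threshold on mean decrease in impurity (0.05 is an assumption)
slim_features = forest_importances[forest_importances > 0.05].index.tolist()
print(slim_features)  # expected to include P_Art, Amount_purchased, Frequency, First_purchase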

Logistic Regression

  • Back to Top
In [151]:
# significant improvement after slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='a_slim_log_logit_final'):
    
    model = LogisticRegression(random_state=7, max_iter=1000)

    m1 = myModel('Slim Log Logistic Regression',model)
    
   
    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "after", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model
    slim_log_logit_model,slim_log_logit_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    #signature?
    signature = infer_signature(Xtest, slim_log_logit_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = .2
    oversamp = .3
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_log_logit_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_log_logit_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_log_logit_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_log_logit_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    

    # log model
    mlflow.sklearn.log_model(slim_log_logit_model, "slim_log_logit model after002003")
    model_uri = mlflow.get_artifact_uri("slim_log_logit model after002003")
    print("Model Loaded")
    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8846153846153847
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:25:54 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:25:54 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:25:54 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:25:54 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:25:54 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved

Random Forest

  • Back to Top
In [63]:
# no significant improvement with slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_slim_rf_model_final'):
    model = RandomForestClassifier(random_state=7)

    m1 = myModel('Slim Random Forest',model)

    # make data
    
    '''
    
    Random forest is well suited to imbalanced data largely because class weights can be
    incorporated into the classifier, making it cost-sensitive: misclassifying the
    minority class is penalized more heavily than misclassifying the majority class.
    
    '''
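    # Illustration only (not used in this run): cost-sensitivity could be added by passing
    # class weights to the forest, e.g. RandomForestClassifier(random_state=7, class_weight='balanced'),
    # which up-weights the minority class when fitting.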
    
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "before",  ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
    
       
    # make model
    params = {'n_estimators': [25,50,100,200,500],
                 'max_depth': [2,5,8, 50]}
    slim_rf_model, slim_rf_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    
    #signature?
    signature = infer_signature(Xtest, slim_rf_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = 0
    oversamp = 0
 
    
    slim_rf_params = slim_rf_model.get_params()
    
    # # log parameters
    mlflow.log_param('n_estimators', slim_rf_params['n_estimators'])
    mlflow.log_param('max_depth', slim_rf_params['max_depth'])
   
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_rf_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_rf_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_rf_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_rf_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model
    mlflow.sklearn.log_model(slim_rf_model, "slim_log rf model")
    model_uri = mlflow.get_artifact_uri("slim_log rf model")
    print("Model Loaded")
    
    
    explainer = shap.Explainer(slim_rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values
    
    #evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config= {"log_model_explainability":False} # explainability disabled: an error is raised when creating the beeswarm plot
    )
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8705128205128204
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
Model Loaded
 99%|===================| 1538/1560 [00:18<00:00]        2023/06/13 14:38:56 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:38:56 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:38:56 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
<Figure size 1050x700 with 0 Axes>

Extreme Gradient Boost

  • Back to Top
In [64]:
# worse with slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_slim_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                             eta=.2,
                             max_depth=3,
                             gamma=1,
                             subsample=1)

    m1 = myModel('Slim XGB Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "before", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
     

    # make model
    params = {'eta' : [0.01, 0.2],
          'max_depth': [3],
          'gamma': [0 , .1],
          'subsample': [0.5, 1]}
    
    slim_tuned_xgb_model, slim_tuned_xgb_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False, params=params)

    #signature?
    signature = infer_signature(Xtest, slim_tuned_xgb_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
    slim_xgb_params = slim_tuned_xgb_model.get_params()
    
    # log parameters
    mlflow.log_param('eta', slim_xgb_params['eta'])
    mlflow.log_param('max_depth', slim_xgb_params['max_depth'])
    mlflow.log_param('gamma', slim_xgb_params['gamma'])
    mlflow.log_param('subsample', slim_xgb_params['subsample'])
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_tuned_xgb_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_tuned_xgb_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_tuned_xgb_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_tuned_xgb_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(slim_tuned_xgb_model, "slim_xgb model")
    model_uri = mlflow.get_artifact_uri("slim_xgb model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8807692307692306
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:38:59 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:38:59 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:38:59 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 14:39:00 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/13 14:39:00 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

CatBoost

  • Back to Top
In [65]:
# worse with slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_slim_catboost_final'):
    
    
    params = {'iterations':[2,4, 8, 20, 50],
            'depth': [6,7,8,9]}


    model = CatBoostClassifier(bagging_temperature=0,
                               learning_rate=1,
                           loss_function='Logloss',
                           verbose=False,
                           random_state=7)

    m1 = myModel('Slim CatBoost Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "before", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
    
    slim_catboost_model, slim_catboost_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    slim_catboost_params = slim_catboost_model.get_params()
   
    #signature?
    signature = infer_signature(Xtest, slim_catboost_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
     # log parameters
    mlflow.log_param('iterations', slim_catboost_params['iterations'])
    mlflow.log_param('learning_rate', slim_catboost_params['learning_rate'])
    mlflow.log_param('depth', slim_catboost_params['depth'])
    mlflow.log_param('bagging_temperature', slim_catboost_params['bagging_temperature'])
    mlflow.log_param('loss_function', slim_catboost_params['loss_function'])
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_catboost_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_catboost_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_catboost_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_catboost_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
     
      

    # log model
    mlflow.sklearn.log_model(slim_catboost_model, "slim_catboost model")
    model_uri = mlflow.get_artifact_uri("slim_catboost model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")

    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.873076923076923
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:39:11 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:39:11 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:39:11 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/06/13 14:39:11 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 14:39:12 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/13 14:39:12 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Ensembling

  • Back to Top
In [153]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv1_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v1: logistic regression, tuned random forest, tuned XGB, and tuned CatBoost
    estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model)]


    model = VotingClassifier(estimators, voting='hard')
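    # Note: voting='hard' takes a majority vote over the base estimators' predicted class
    # labels (as opposed to voting='soft', which would average predicted probabilities).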

    m1 = myModel('EnsV1 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v1_model, ensemble_v1_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)

    
    #signature?
    signature = infer_signature(Xtest, ensemble_v1_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v1_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v1_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v1_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v1_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v1_model, "ens1 model")
    model_uri = mlflow.get_artifact_uri("ens1 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
/Users/coolkid/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.

Average Test Accuracy Score: 0.8756410256410255
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:30:05 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:30:05 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:30:05 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:30:05 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample_bytree=None,\n                                            early_stopping_rounds=None,\n                                            enable_categorical=Fal...\n                                            interaction_constraints=None,\n                                            learning_rate=None, max_bin=None,\n                                            max_cat_threshold=None,\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...)),\n                             ('tuned_sample_catboost',\n                              <catboost.core.CatBoostClassifier object at 0x17a8ae560>)])"). Set logging level to DEBUG to see the full traceback.
Metrics Saved
<Figure size 1050x700 with 0 Axes>
In [67]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv2_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v2: ensemble v1 estimators plus the log-transformed logistic regression
    estimators=[('log logit', log_logit_model),('logit', logit_model), ('rf_tuned', rf_model), 
                ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model)]


    model = VotingClassifier(estimators, voting='hard')

    m1 = myModel('EnsV2 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v2_model, ensemble_v2_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)

    
    #signature?
    signature = infer_signature(Xtest, ensemble_v2_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost',
             'modl5': 'log_logit'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v2_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v2_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v2_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v2_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v2_model, "ens2 model")
    model_uri = mlflow.get_artifact_uri("ens2 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8756410256410255
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:39:23 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:39:23 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:39:23 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 14:39:23 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample...\n                                            interaction_constraints=None,\n                                            learning_rate=None, max_bin=None,\n                                            max_cat_threshold=None,\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...)),\n                             ('tuned_sample_catboost',\n                              <catboost.core.CatBoostClassifier object at 0x285186b60>)])"). Set logging level to DEBUG to see the full traceback.
<Figure size 1050x700 with 0 Axes>
In [113]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv3_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v3: ensemble v2 estimators plus the slim log logistic regression
    estimators=[('log logit', log_logit_model),('logit', logit_model), ('rf_tuned', rf_model), 
                ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model), 
               ('slim log logit', slim_log_logit_model)]


    model = VotingClassifier(estimators, voting='hard')

    m1 = myModel('EnsV3 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v3_model, ensemble_v3_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    #signature?
    signature = infer_signature(Xtest, ensemble_v3_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost',
             'modl5': 'log_logit',
             'modl6': 'slim log logit'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v3_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v3_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v3_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v3_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v3_model, "ens3 model")
    model_uri = mlflow.get_artifact_uri("ens3 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    

    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8794871794871796
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 15:33:16 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 15:33:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 15:33:16 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 15:33:16 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample...\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...)),\n                             ('tuned_sample_catboost',\n                              <catboost.core.CatBoostClassifier object at 0x28698aa40>),\n                             ('slim log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7))])"). Set logging level to DEBUG to see the full traceback.
<Figure size 1050x700 with 0 Axes>
In [154]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv4_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v4: ensemble v3 estimators plus the log-transformed XGB
    estimators=[('log logit', log_logit_model),('logit', logit_model), ('rf_tuned', rf_model), 
                ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model), 
               ('slim log logit', slim_log_logit_model), ('log_xgb', log_tuned_xgb_model)]


    model = VotingClassifier(estimators, voting='hard')

    m1 = myModel('EnsV4 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v4_model, ensemble_v4_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    #signature?
    signature = infer_signature(Xtest, ensemble_v4_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost',
             'modl5': 'log_logit',
             'modl6': 'slim log logit',
             'modl7': 'log_xgb'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v4_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v4_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v4_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v4_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v4_model, "ens4 model")
    model_uri = mlflow.get_artifact_uri("ens4 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8756410256410255
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:32:35 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:32:35 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:32:35 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:32:36 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample...\n                                            feature_types=None, gamma=1,\n                                            gpu_id=None, grow_policy=None,\n                                            importance_type=None,\n                                            interaction_constraints=None,\n                                            learning_rate=None, max_bin=None,\n                                            max_cat_threshold=None,\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...))])"). Set logging level to DEBUG to see the full traceback.
Metrics Saved
<Figure size 1050x700 with 0 Axes>
In [165]:
# Loading json files from mlflow evaluation
logit_results = json_to_df("logit_metrics.json", "Logistic Regression")
log_logit_results = json_to_df("log_logit_metrics.json", "Log Logistic Regression")
slim_log_logit_results = json_to_df("slim_log_logit_metrics.json", "Slim Log Logistic Regression")
rf_results = json_to_df("rf_metrics.json", "Random Forest")
tuned_xgb_results = json_to_df("xgb_metrics.json", "XGB")
catboost_results = json_to_df("catboost_metrics.json", "CatBoost")
log_tuned_xgb_results = json_to_df("log_xgb_metrics.json", "Log XGB")
ensv1_results = json_to_df("ensv1_metrics.json", "EnsV1")
ensv4_results = json_to_df("ensv4_metrics.json", "EnsV4")
In [ ]:
 

Results ¶

  • Back to Top
In [310]:
# TODO: concatenate all data frames together
'''
visualize:
    1. Precision on 1 plus others (recall/f1?) for each model
    2. ROC curve for each model
    3. Test Accuracy for each model
    4. preds for 1 for each model
'''

data_df = pd.concat([logit_results,log_logit_results,slim_log_logit_results,
                    rf_results,tuned_xgb_results,catboost_results,log_tuned_xgb_results,
                    ensv1_results, ensv4_results])


data_df.drop_duplicates(inplace=True)
In [311]:
data_df
Out[311]:
score true_negatives false_positives false_negatives true_positives example_count accuracy_score recall_score precision_score f1_score log_loss roc_auc precision_recall_auc Model
1 0.865385 638 39 66 37 780 0.865385 0.359223 0.486842 0.413408 0.341163 0.782292 0.439211 Logistic Regression
1 0.873077 644 33 66 37 780 0.873077 0.359223 0.528571 0.427746 0.337587 0.785131 0.440876 Log Logistic Regression
1 0.871795 652 25 75 28 780 0.871795 0.271845 0.528302 0.358974 0.365266 0.735842 0.400491 Slim Log Logistic Regression
1 0.879487 665 12 82 21 780 0.879487 0.203883 0.636364 0.308824 0.335726 0.751846 0.416268 Random Forest
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 XGB
1 NaN 656 21 81 22 780 0.869231 0.213592 0.511628 0.301370 0.346526 0.734128 0.379118 CatBoost
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 Log XGB
1 0.888462 670 7 80 23 780 0.888462 0.223301 0.766667 0.345865 NaN NaN NaN EnsV1
1 0.891026 667 10 75 28 780 0.891026 0.271845 0.736842 0.397163 NaN NaN NaN EnsV4

Best Metrics from Each Model Type ¶

  • Back to Top
In [313]:
# selecting and sorting metrics for easier comparison
viz_df = data_df

viz_df = viz_df[['Model', 'accuracy_score', 'precision_score', 'f1_score', 'recall_score']]
viz_df = viz_df.sort_values('accuracy_score', ascending=False)

viz_df.plot(x="Model",
           kind="bar",
           stacked=False,
           title = "Top Model Metric Comparison")
Out[313]:
<Axes: title={'center': 'Top Model Metric Comparison'}, xlabel='Model'>

Apologies for the legend being in the way! I originally plotted this with Plotly, but the chart did not render in HTML and I wanted to post this today!

Greater detail about the metrics can be found below. The purpose of this plot is to highlight the high accuracy achieved by our ensembling approaches and the performance improvement after log transforming and applying feature selection to our baseline model (Logistic Regression).

In [268]:
get_profit(data_df)
/var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:185: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:186: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:187: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[268]:
score true_negatives false_positives false_negatives true_positives example_count accuracy_score recall_score precision_score f1_score log_loss roc_auc precision_recall_auc Model Profit No Model Profit Profit Boost From Model
1 0.865385 638 39 66 37 780 0.865385 0.359223 0.486842 0.413408 0.341163 0.782292 0.439211 Logistic Regression 877.7 373.65 134.898970
1 0.873077 644 33 66 37 780 0.873077 0.359223 0.528571 0.427746 0.337587 0.785131 0.440876 Log Logistic Regression 883.7 373.65 136.504750
1 0.871795 652 25 75 28 780 0.871795 0.271845 0.528302 0.358974 0.365266 0.735842 0.400491 Slim Log Logistic Regression 891.7 373.65 138.645792
1 0.879487 665 12 82 21 780 0.879487 0.203883 0.636364 0.308824 0.335726 0.751846 0.416268 Random Forest 904.7 373.65 142.124983
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 XGB 896.7 373.65 139.983942
1 NaN 656 21 81 22 780 0.869231 0.213592 0.511628 0.301370 0.346526 0.734128 0.379118 CatBoost 895.7 373.65 139.716312
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 Log XGB 896.7 373.65 139.983942
1 0.888462 670 7 80 23 780 0.888462 0.223301 0.766667 0.345865 NaN NaN NaN EnsV1 909.7 373.65 143.463134
1 0.891026 667 10 75 28 780 0.891026 0.271845 0.736842 0.397163 NaN NaN NaN EnsV4 906.7 373.65 142.660244
In [ ]:
# ensv1 has greater precision on minority class

Findings ¶

  • Back to Top

We stand to make a substantial profit from implementing machine learning in our direct marketing campaigns. As opposed to earning $373.65 with no model, the company can earn $909.70 using our optimal model! That is roughly a 143% profit boost (see the quick check below) and would bring even greater margins across the company's 50K subscribers!
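
A minimal sanity check of that figure, using the profit values reported by get_profit above (the dollar amounts are copied from that table):

# Profit with the top ensemble vs. no model, from the get_profit output above
model_profit, no_model_profit = 909.70, 373.65
boost_pct = (model_profit - no_model_profit) / no_model_profit * 100
print(round(boost_pct, 1))  # ~143.5% increase over the no-model baseline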

The optimal model was the Version 1 ensembling model. It achieved an accuracy score of about 88% and a precision of about 76%. The model with the highest accuracy was our Version 4 ensembling model, but it had lower precision, which is an example of why each metric should be understood. Precision measures how many of the model's positive predictions are actually positive (true positives divided by true plus false positives), so it is the right metric when false positives are costly. We wanted a model that best predicts Purchase with the lowest number of false positives; our EnsV1 model achieves this (see the quick check below) and would be my recommendation for the company.
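
As a quick check, the EnsV1 precision can be recomputed from the confusion-matrix counts in the results table above (counts copied from the EnsV1 row):

# Precision = TP / (TP + FP)
tp, fp = 23, 7
precision = tp / (tp + fp)
print(round(precision, 3))  # 0.767, the ~76% precision cited for EnsV1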

Feature Importance ¶

(Embedded image: feature importance chart.)

The most important features were somewhat obvious when you think of the business use case. Focus on Customers who:

  • have purchased an Art book in the past
  • have a large total amount purchased
  • have a high number of purchases in a given period

Observing the summary output of a Logistic Regression model is useful when there are statistically significant features, since it shows which direction each feature pushes the odds of Purchase. In our slim model, one feature could be considered statistically significant after log transforming and resampling the data. This could be experimented with and tested further, but based on the logistic model's odds ratios the company should consider customers who (a short odds-ratio sketch follows this list):

  • have made their first purchase recently. Odds of purchase decrease for every one-unit increase in First_purchase (on the log scale, since this variable was log transformed)
  • have made a large number of art book purchases in the past
  • have not made many purchases in the chosen period. Odds of purchase decrease by a factor of about 0.5 for every one-unit increase in Frequency (again on the log scale)
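
A minimal sketch of the odds-ratio reading used above (coefficient values are copied from the Logit summary printed below; Frequency and First_purchase are on the natural-log scale):

import numpy as np

coefs = {'P_Art': 0.9561, 'Amount_purchased': 0.0020, 'Frequency': -0.6888, 'First_purchase': -0.2104}
for name, b in coefs.items():
    odds_ratio = np.exp(b)               # OR = e^coef
    pct_change = (odds_ratio - 1) * 100  # percent change in the odds per one-unit increase
    print(f"{name}: OR={odds_ratio:.2f}, odds change per unit increase ~ {pct_change:+.0f}%")
# e.g. Frequency: OR ~ 0.50, so the odds of purchase roughly halve per unit increase in log(Frequency)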
In [279]:
#No Values Were Significant so did not include in the analysis
# make data
Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "after", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
# Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)

# log transform data
Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

print_model = sm.Logit(ytrain,Xtrain).fit()
print(print_model.summary())

# convert log odds into more interpretable odds ratios
model_odds = pd.DataFrame(np.exp(print_model.params), columns=['OR'])
model_odds
Optimization terminated successfully.
         Current function value: 0.466430
         Iterations 6
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                 Choice   No. Observations:                 3256
Model:                          Logit   Df Residuals:                     3252
Method:                           MLE   Df Model:                            3
Date:                Tue, 13 Jun 2023   Pseudo R-squ.:                  0.1363
Time:                        18:43:07   Log-Likelihood:                -1518.7
converged:                       True   LL-Null:                       -1758.4
Covariance Type:            nonrobust   LLR p-value:                1.327e-103
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
P_Art                0.9561      0.069     13.833      0.000       0.821       1.092
Amount_purchased     0.0020      0.000      4.155      0.000       0.001       0.003
Frequency           -0.6888      0.075     -9.168      0.000      -0.836      -0.542
First_purchase      -0.2104      0.073     -2.877      0.004      -0.354      -0.067
====================================================================================
Out[279]:
OR
P_Art 2.601529
Amount_purchased 1.001957
Frequency 0.502203
First_purchase 0.810282

Now let's use this optimal model to create a Marketing List that gives our company an actionable item they can use!

Marketing List ¶

  • Back to Top

Now let's select some observations that are likely to purchase our material. This could be used to generate leads for marketers in the company.

In [281]:
Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
preds = ensemble_v4_model.predict(Xtest)
In [282]:
df_preds = pd.DataFrame(ytest)
df_preds['ensv4']= preds
df_preds.reset_index(inplace=True)
df_preds.rename(columns={'index':'CustomerId'}, inplace=True) # if we were given customer IDs in a real-world situation, this would be the index.
df_preds
Out[282]:
CustomerId Choice ensv4
0 284 1 0
1 244 1 0
2 1134 0 0
3 440 0 0
4 820 0 0
... ... ... ...
775 1179 0 0
776 577 0 0
777 1309 0 0
778 760 0 0
779 1277 0 0

780 rows × 3 columns

In [286]:
test = df_preds[df_preds['Choice']==1]
test_final = test[test['ensv4']==1]
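# Note: filtering on Choice == 1 here uses the known test labels to verify which actual buyers
# the ensemble also flagged; in production only the ensv4 == 1 predictions would be available.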
test_final
Out[286]:
CustomerId Choice ensv4
39 110 1 1
51 157 1 1
131 141 1 1
150 95 1 1
154 194 1 1
237 186 1 1
256 353 1 1
280 76 1 1
316 200 1 1
347 234 1 1
377 29 1 1
401 254 1 1
409 125 1 1
426 395 1 1
438 96 1 1
446 151 1 1
469 335 1 1
490 118 1 1
506 227 1 1
546 365 1 1
548 193 1 1
554 123 1 1
564 178 1 1
596 343 1 1
597 237 1 1
710 38 1 1
742 33 1 1
750 99 1 1

Now we have 28 customers we can reach out to with a high likelihood that they will purchase. This was an interesting topic to experiment with, and I look forward to using this approach again in the future!