In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import json


from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
import statsmodels.api as sm
import mlflow
import mlflow.sklearn
from statistics import mean
from mlflow.models.signature import infer_signature
import shap
from catboost import CatBoostClassifier, Pool
import plotly.express as px # needed by calc_metrics2 below

Table of Contents

  • User Defined Functions
  • Background
  • Exploratory Data Analysis
  • Handling Imbalanced Data
  • Data Modeling: No Resample, No Transformations
    • Logistic Regression
    • Random Forest
    • Extreme Gradient Boost
    • CatBoost
  • Data Modeling: Resampled, Transformed
    • Logistic Regression
    • Random Forest
    • Extreme Gradient Boost
    • CatBoost
  • Data Modeling: Feature Selection
    • Feature Importance: Embedded Approach
      • Logistic Regression
      • Random Forest
      • Extreme Gradient Boost
      • CatBoost
  • Ensembling
  • Results
    • Best Metrics
    • Findings
    • Marketing List

User Defined Functions

  • Back to Top
In [267]:
class myModel:
    def __init__(self, name, model):
        self.name=name
        self.model=model
        
        
    def log_transform(self, columns:list):
        """Log transforms columns from dataframe that has been cross validated with a hold out dataset

        Args:
            columns: A list of strings representing variable names

        Returns:
            2 train & test dataframes with log transformations applied to applicable columns
        """
        # NOTE: operates on the notebook-level Xtrain/Xtest created by make_data()
        for col in columns:
            Xtrain[col]=np.log(Xtrain[col])
            Xtest[col]=np.log(Xtest[col])

        return Xtrain,Xtest



    def make_data(self, str1:str, str2:str, str3:list=None):
        """Performs hold out cross validation on dataset with options for selecting variables

        Args:
            str1: A string denoting if the function should return a full or selective dataset
            str2: A string denoting if the function should perform sampling or not
            str3: A list of strings representing variables to select in feature selection


        Returns:
            4 datasets of train test split data with applicable manipulations dependent upon the arguments given (i.e., feature selection, oversampling, none)
        """
        train = pd.read_excel('BBBC-Train.xlsx')
        test = pd.read_excel('BBBC-Test.xlsx')
        data = pd.concat([test, train])
        data.drop('Observation', axis=1, inplace=True) # keeping this column yields suspiciously high accuracy (~97%),
                                                        # which looks like leakage/overfitting
    

        if str2 == 'before':
            if str1 == 'full':
                X = data.drop(['Choice'], axis=1)
                y = data['Choice']

                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
                return Xtrain, Xtest, ytrain, ytest
            elif str1 == 'slim':
                X = data[str3]
                y = data['Choice']
                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
                return Xtrain, Xtest, ytrain, ytest
            else:
                print('Error')
        elif str2 == 'after':
            if str1 == 'full':
                X = data.drop(['Choice'], axis=1)
                y = data['Choice']
                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
              # define Pipeline and do random undersampling then SMOTE oversampling of dataset
              # [.8, .5, .2] over

                under = RandomUnderSampler(sampling_strategy=.2)#.2
                over = SMOTE(sampling_strategy=.3) # want about 1500 in minority class and 2000 majority

              # define pipeline steps
                steps = [('u', under), ('o', over)]
                pipeline = Pipeline(steps=steps)
                Xtrain, ytrain = pipeline.fit_resample(Xtrain, ytrain)
                return Xtrain, Xtest, ytrain, ytest
            elif str1 == 'slim':
                X = data[str3]
                y = data['Choice']
                Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)
                # define Pipeline and do random undersampling then SMOTE oversampling of dataset
                under = RandomUnderSampler(sampling_strategy=.2)
                over = SMOTE(sampling_strategy=.3) # want about 1500 in minority class and 2000 majority

                # define pipeline steps
                steps = [('u', under), ('o', over)]
                pipeline = Pipeline(steps=steps)
                Xtrain, ytrain = pipeline.fit_resample(Xtrain, ytrain)
                return Xtrain, Xtest, ytrain, ytest
        else:
            print('Error')


    def fit_model(self, model, scoring:list, tuning=False, params=None):
        """Fits model and scores with cross validations based on selected metrics
        
        Args:
            model: A chosen Ml model for training
            scoring: A list of metrics to use for scoring over default 5 folds
            tuning: A boolean representing whether hyper parameter tuning should be performed
            params: A dictionary of parameters to be tuned for the given model
            
        Returns:
            1. fit model
            2. A dict holding average metrics over each CV fold
        
        
        """
        if not tuning:

            fit_model = model.fit(Xtrain,ytrain)

            scores = cross_validate(model, Xtest, ytest, scoring=scoring)
            
            # print(f"Average Test Accuracy Score: {scores['test_accuracy'].mean()}")
            
            return fit_model, scores
            
        elif tuning == True:
            
            model = GridSearchCV(model, params, cv=5)

            model.fit(Xtrain,ytrain)

            fit_model = model.best_estimator_

            scores = cross_validate(model, Xtest, ytest, scoring=scoring)
            # print(f"Average Test Accuracy Score: {scores['test_accuracy'].mean()}")
            
            return fit_model, scores
        


 
    
    
    
def get_profit(df):
    
    
   
    # profit = revenue - cost

    #cost per book
    mailing_cost = 0.65
    manuf_cost = 15
    overhead_cost = manuf_cost * .45

    #revenue
    selling_price = 31.95
    
    all_cust = 50000
    # profit per book = 31.95 - 0.65 - 15 - 6.75 = 9.55
    prof_book = (selling_price - mailing_cost - manuf_cost - overhead_cost)

    n_cust = 103 # customers who purchased in test set
    total_cust = 780 # total customers in test set

    pct_buying = .132
    
    
    cost_nomodel = mailing_cost*all_cust
    resp_nomodel = pct_buying*all_cust

    # create empty columns
    df['Profit'] = 0
    df['No Model Profit'] = 0
    df['Profit Boost From Model'] = 0

    for i in df.index:
        not_sold_by_model = df['false_negatives'][i] # false negatives
        sold_by_model = df['true_positives'][i] # true positives 
        missed_opportunities = df['false_positives'][i] # false positives
        total_mailed = not_sold_by_model + sold_by_model
        no_model_total = n_cust
        total_cust = 780

        # subtracting mailing expenses and calculating profit for total costs predicted by model

        # penalizes for false positives
        profit = (((not_sold_by_model + sold_by_model) * prof_book)- missed_opportunities) - (total_mailed * mailing_cost)
        
      
         # uncomment for original
        no_model_profit = ((no_model_total * prof_book)-no_model_total) - (total_cust * mailing_cost)


        df.loc[i, 'Profit'] = profit
        df.loc[i, 'No Model Profit'] = no_model_profit
        df.loc[i, 'Profit Boost From Model'] = ((profit - no_model_profit)/no_model_profit)*100
        


    return df






def calc_metrics2(models):

    models = models[['Model','accuracy_score', 'precision_score','f1_score', 'recall_score']]

    # reshape to long format so each metric becomes its own row for the grouped bar chart
    models = pd.melt(models, id_vars=['Model'], value_vars=['accuracy_score', 'precision_score', 'f1_score', 'recall_score'])

    fig = px.bar(models, x="Model", y="value",
                 color="variable", barmode="group")

    fig.show()
    

def json_to_df(json_file, name):
      
    # store json file to dictionary
    with open(json_file) as file:
        data = json.load(file)
    data = pd.DataFrame(data, index=[x for x in range(1, len(data))])
    data['Model'] = name
    
    return data
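
# NOTE: later cells call m1.make_preds, m1.make_roc, m1.make_report and m1.make_cm,
# which are not defined in the class above. The functions below are a minimal sketch of
# what such helpers could look like, assuming they evaluate the fitted model against the
# notebook-level Xtest/ytest the way fit_model does. The signatures mirror how the methods
# are called later; everything else is an assumption, not the author's code.

def make_preds_sketch(fitted_model):
    # predict on the hold-out set
    return fitted_model.predict(Xtest)

def make_roc_sketch(preds):
    # ROC curve points for the hold-out predictions
    fpr, tpr, thresholds = metrics.roc_curve(ytest, preds)
    return pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': thresholds})

def make_report_sketch(preds):
    # per-class precision / recall / f1 on the hold-out set
    rpt = classification_report(ytest, preds)
    print(rpt)
    return rpt

def make_cm_sketch(preds):
    # confusion matrix on the hold-out set
    cm = confusion_matrix(ytest, preds)
    ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    return cm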
           

Background

  • Back to Top

As of 1994, about 50,000 new titles were published in the US each year, giving rise to a $20B book publishing industry. About 10% of these books were sold through mail order. Recently, online superstores such as Amazon have emerged, carrying 1-2.5M titles and further intensifying the pressure on book clubs and mail order firms. In response to these pressures, book clubs are starting to look at alternative business models that will make them more responsive to their customers' preferences.

The Bookbinders Book Club (BBBC) was established in 1986 for the purpose of selling specialty books through direct marketing. BBBC is strictly a distributor and does not publish any of the books it sells. In anticipation of using database marketing, BBBC made a strategic decision right from the start to build and maintain a detailed database about its members containing all relevant information about them. Readers fill out an insert and return it to BBBC, which then enters the data into the database.

BBBC is exploring whether to use predictive modeling approaches to improve the efficacy of its direct mail program. For this analysis, we will use a subset of the database available to BBBC. The dependent variable (i.e., response group) for the analysis is Choice – purchase or no purchase of the book. BBBC also selected several independent variables that it thought might explain the observed choice behavior. The variables are:

Name Type Description
Choice int Whether the customer purchased The Art History of Florence. 1 = purchase and 0 = non-purchase
Gender int 0 = Female and 1 = Male
Amount_purchased int Total money spent on BBBC books
Frequency int Total number of purchases in the chosen period
Last_purchase int Months since last purchase
First_purchase int Months since first purchase
P_Child int Number of children’s books purchased
P_Youth int Number of youth’s books purchased
P_Cook int Number of cookbooks purchased
P_DIY int Number of do-it-yourself books purchased
P_Art int Number of art books purchased

Exploratory Data Analysis

  • Back to Top

Problem Type:
This is a supervised classification problem on structured data, since we are predicting a binary response: 0 or 1 for purchase or no purchase, respectively.

Business Goals/Requirements:

  • Goal:
    • Develop the most accurate model for predicting which customers will purchase the marketed book
    • Achieve above 50% accuracy (better than random chance)
    • Maximize profit
  • Requirements:
    • Optimize for Precision. Of all customers the model predicts will purchase, we want as many as possible to actually purchase, rather than optimizing for Recall (capturing every customer who actually purchases). Precision on the purchase class will act as our north star during this modeling process, alongside other metrics like recall, f1, and test accuracy (a tiny worked example follows this list)
    • Minimize False Positives. It would be a poor use of resources for the company to mail a customer it predicts will purchase when they do not.
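
As a quick worked example of the precision vs. recall distinction (toy labels purely for illustration, not the BBBC data):

from sklearn.metrics import precision_score, recall_score, confusion_matrix

# 1 = purchase, 0 = no purchase (made-up labels)
y_true = [1, 0, 0, 1, 0, 0, 1, 0]
y_pred = [1, 1, 0, 0, 0, 0, 1, 0]

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("precision = TP/(TP+FP) =", tp, "/", tp + fp, "=", precision_score(y_true, y_pred))  # 2/3
print("recall    = TP/(TP+FN) =", tp, "/", tp + fn, "=", recall_score(y_true, y_pred))     # 2/3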

To avoid data leakage I perform hold-out cross validation before exploring the data:

In [38]:
model = LogisticRegression(random_state=7, max_iter=1000)
m1 = myModel('EDA',model)

# make data
x_train, x_test, y_train, y_test = m1.make_data("full", "before", _)
In [ ]:
 
In [39]:
x_train.head()
Out[39]:
Gender Amount_purchased Frequency Last_purchase First_purchase P_Child P_Youth P_Cook P_DIY P_Art
576 1 210 12 1 12 0 0 0 1 0
1522 0 34 4 1 4 0 0 1 0 0
465 1 303 2 2 6 0 0 1 0 0
488 1 266 34 2 40 0 0 0 0 1
451 0 340 14 4 26 3 0 1 0 0
In [40]:
# how many observations will we be training with?
x_train.shape
Out[40]:
(3120, 10)
In [41]:
# is response variable balanced?
fig, axs = plt.subplots(2)
fig.suptitle('Observing Imbalanced Data')
axs[0].hist(y_train)
axs[1].hist(y_test)
Out[41]:
(array([677.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0., 103.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
 <BarContainer object of 10 artists>)

The response variable is highly imbalanced. Leaving the data in its current state could lead to a model that is a poor predictor for the minority class (1), which is our focus in this modeling process. One of our goals is to best predict customers who will purchase, and a model that predicts otherwise will be ineffective. There are many techniques to overcome this; I will attempt 2 and observe the results:

  1. Oversampling with SMOTE: This technique is more robust than ordinary oversampling. Traditionally we would just clone the same data points until the variable is more optimally balanced. This leads to poor performance as no new information is added to the model and one observation will outweigh all others. Synthetic Minority Oversampling Technique seeks to correct this by synthesizing new examples from combinations of existing examples.
  2. Undersampling: I will use traditional undersampling to remove observations from the majority class. Research has shown this combination outperforms oversampling alone (SMOTE: Synthetic Minority Over-sampling Technique, 2011). A tiny sketch of the combined pipeline follows this list.
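
A minimal sketch of the undersample-then-SMOTE pipeline used later in make_data, run on synthetic data just to show how the class counts shift (the 0.2/0.3 ratios mirror the ones used below; the toy data itself is made up):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# toy data with roughly the same ~13% minority rate as the BBBC response
X_toy, y_toy = make_classification(n_samples=3000, weights=[0.87], random_state=7)
print("before:", Counter(y_toy))

pipeline = Pipeline([('u', RandomUnderSampler(sampling_strategy=0.2)),   # shrink the majority class
                     ('o', SMOTE(sampling_strategy=0.3))])               # then synthesize minority examples
X_res, y_res = pipeline.fit_resample(X_toy, y_toy)
print("after: ", Counter(y_res))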

Modeling Approaches: This is a supervised classification problem so a few modeling approaches come to mind:

  • Logistic Regression: it is interpretable and a good baseline model. (Maybe a Naive Bayes model, but I am unsure about all variables being independent of each other)
  • Random Forest (RF): A strong step up from the logit model, where we can also gain feature importance from this embedded feature selection approach. We will also tune and observe results
  • Support Vector Classifier (SVC): There are fewer than 10,000 observations, so this is a viable and powerful approach to this type of problem due to its kernel trick. We will tune this for optimal results, as otherwise we could observe poor performance
  • XGB: tuning can be time consuming, so we will need to select our parameters wisely; it also works well on small datasets compared to Random Forest models, which generally do not perform well on small data
  • Ensembling: Will take the best performing models (per our requirements!) and tie together those learnings for what is hopefully an optimal model (a minimal VotingClassifier sketch follows this list)
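
A minimal sketch of the ensembling idea with sklearn's VotingClassifier; the base estimators here are placeholders, not the final ensemble:

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# soft voting averages the predicted probabilities across the base models
ensemble = VotingClassifier(
    estimators=[('logit', LogisticRegression(max_iter=1000, random_state=7)),
                ('rf', RandomForestClassifier(random_state=7))],
    voting='soft')
# ensemble.fit(Xtrain, ytrain); ensemble.predict(Xtest)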

Let's start exploring the basics of the data...

In [42]:
# what kind of data types in train and test
print("Training:")
print(x_train.info())
print()
print("Testing:")
print(x_test.info())
Training:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3120 entries, 576 to 175
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Gender            3120 non-null   int64
 1   Amount_purchased  3120 non-null   int64
 2   Frequency         3120 non-null   int64
 3   Last_purchase     3120 non-null   int64
 4   First_purchase    3120 non-null   int64
 5   P_Child           3120 non-null   int64
 6   P_Youth           3120 non-null   int64
 7   P_Cook            3120 non-null   int64
 8   P_DIY             3120 non-null   int64
 9   P_Art             3120 non-null   int64
dtypes: int64(10)
memory usage: 268.1 KB
None

Testing:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 780 entries, 284 to 1277
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Gender            780 non-null    int64
 1   Amount_purchased  780 non-null    int64
 2   Frequency         780 non-null    int64
 3   Last_purchase     780 non-null    int64
 4   First_purchase    780 non-null    int64
 5   P_Child           780 non-null    int64
 6   P_Youth           780 non-null    int64
 7   P_Cook            780 non-null    int64
 8   P_DIY             780 non-null    int64
 9   P_Art             780 non-null    int64
dtypes: int64(10)
memory usage: 67.0 KB
None

We observe a few things here:

  • all data is of type integer
  • there are no null values in the train or test set

Let's use describe() to see some statistical metrics:

In [43]:
x_train.describe()
Out[43]:
Gender Amount_purchased Frequency Last_purchase First_purchase P_Child P_Youth P_Cook P_DIY P_Art
count 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000 3120.000000
mean 0.670192 196.717949 12.815385 3.106410 22.627564 0.725000 0.347115 0.771795 0.394231 0.372436
std 0.470219 94.960142 8.052841 2.942795 15.906823 1.018906 0.644874 1.032576 0.692815 0.675507
min 0.000000 15.000000 2.000000 1.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 122.000000 6.000000 1.000000 12.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 1.000000 199.000000 12.000000 2.000000 18.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.000000 268.000000 16.000000 4.000000 30.000000 1.000000 1.000000 1.000000 1.000000 1.000000
max 1.000000 474.000000 36.000000 12.000000 96.000000 8.000000 5.000000 6.000000 4.000000 5.000000

There doesn't appear to be anything awkward in the data, meaning there are no negative minimums for Amount_purchased. If I were modeling in R I might convert Gender to a factor, but I will leave the variable as is in Python since most libraries/algorithms can handle binary variables directly without having to convert.

In [44]:
fig, axs = plt.subplots(2,5, figsize = (20,10))
fig.suptitle('Observing Normality in Data')
axs[0][0].hist(x_train['Gender'])
axs[0][1].hist(x_train['Amount_purchased'])
axs[0][2].hist(x_train['Frequency'])
axs[0][3].hist(x_train['Last_purchase'])
axs[0][4].hist(x_train['First_purchase'])
axs[1][0].hist(x_train['P_Child'])
axs[1][1].hist(x_train['P_Youth'])
axs[1][2].hist(x_train['P_Cook'])
axs[1][3].hist(x_train['P_DIY'])
axs[1][4].hist(x_train['P_Art'])

#define subplot titles
axs[0, 0].set_title('Gender')
axs[0, 1].set_title('Amount_purchased')
axs[0, 2].set_title('Frequency')
axs[0, 3].set_title('Last_purchase')
axs[0, 4].set_title('First_purchase')
axs[1, 0].set_title('P_Child')
axs[1, 1].set_title('P_Youth')
axs[1, 2].set_title('P_Cook')
axs[1, 3].set_title('P_DIY')
axs[1, 4].set_title('P_Art')
Out[44]:
Text(0.5, 1.0, 'P_Art')

Disregarding Gender, all variables appear slightly right skewed. We could do a log transformation and see if that recenters (normalizes) the data. However, the log transformation does not work for the count variables that range from 0 to 6 (e.g., P_Cook) because the logarithm is undefined for values less than or equal to zero, and we have many values at zero for multiple variables. We will not log transform these variables, as dropping the zero-valued rows would mean removing the majority of the data.
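
If we did want a log-style transform that tolerates zeros, np.log1p (i.e., log(1 + x)) is one option; a minimal sketch for illustration only, not applied in this analysis:

# np.log is undefined at 0, but log1p(x) = log(1 + x) handles zero counts
counts = np.array([0, 1, 2, 6])
print(np.log1p(counts))   # [0.         0.69314718 1.09861229 1.94591015]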

In [45]:
# log transform data
log_xtrain = np.log(x_train)


fig, axs = plt.subplots(1,4, figsize = (20,10))
fig.suptitle('Observing Normality in Data')
#axs[0][0].hist(log_xtrain['Gender'])
axs[0].hist(log_xtrain['Amount_purchased'])
axs[1].hist(log_xtrain['Frequency'])
axs[2].hist(log_xtrain['Last_purchase'])
axs[3].hist(log_xtrain['First_purchase'])
#axs[1][0].hist(log_xtrain['P_Child'])
# axs[1][1].hist(log_xtrain['P_Youth'])
# axs[1][2].hist(log_xtrain['P_Cook'])
# axs[1][3].hist(log_xtrain['P_DIY'])
# axs[1][4].hist(log_xtrain['P_Art'])

#define subplot titles
#axs[0, 0].set_title('Gender')
axs[0].set_title('Amount_purchased')
axs[1].set_title('Frequency')
axs[2].set_title('Last_purchase')
axs[3].set_title('First_purchase')
# axs[1, 0].set_title('P_Child')
# axs[1, 1].set_title('P_Youth')
# axs[1, 2].set_title('P_Cook')
# axs[1, 3].set_title('P_DIY')
# axs[1, 4].set_title('P_Art')
divide by zero encountered in log
Out[45]:
Text(0.5, 1.0, 'First_purchase')

No Transformation Will Be Applied:

  • Amount_purchased becomes less normal after the transformation, so we will not transform this variable.
  • Last_purchase is dominated by the high volume of customers sitting near 0; the transformation did not help, so we will not transform this variable.

Transformation Applied:

  • Frequency and First_purchase did benefit from the transformation and become more normally distributed.

Next I will look for relationships among the independent variables and between each of them and the response. We will observe a pairplot, but since we want to include the response we will need to read in the data again.

In [46]:
train0 = pd.read_excel('BBBC-Train.xlsx')
test0 = pd.read_excel('BBBC-Test.xlsx')
data0 = pd.concat([test0, train0])
#data0.drop('Observation', axis=1, inplace=True)
plt.figure(figsize=(1,5)) # note: sns.pairplot creates its own figure, so this call just produces the empty Figure shown below
sns.pairplot(data0)
plt.show()
<Figure size 100x500 with 0 Axes>

The clearest relationship is between First_purchase and Amount_purchased: the more months since first purchase, the more the customer is likely to have spent. Let's plot a heatmap of the same relationships.

In [47]:
plt.figure(figsize=(12, 7))
sns.heatmap(data0.corr(), annot = True, vmin=0, vmax=1)
plt.show()

Multicollinearity is when two or more independent variables in a regression model are highly correlated with each other. This can be a problem because it makes it difficult to identify the unique effect of each variable on the dependent variable. Additionally, because the independent variables are correlated, the coefficients in the regression equation can change a lot when a data point is added or removed, making the model unstable and unreliable. In this study any correlation over 75% is considered highly correlated.
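
Pairwise correlation only catches two-variable relationships; variance inflation factors give a per-variable view of multicollinearity. A minimal sketch using statsmodels, assuming the data0 frame from two cells earlier (which still contains the Observation column):

from statsmodels.stats.outliers_influence import variance_inflation_factor

X_vif = data0.drop(['Choice', 'Observation'], axis=1)
vif = pd.DataFrame({'variable': X_vif.columns,
                    'VIF': [variance_inflation_factor(X_vif.values, i)
                            for i in range(X_vif.shape[1])]})
print(vif.sort_values('VIF', ascending=False))   # VIF above ~5-10 usually flags problematic collinearity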

Things to consider from heatmap:

  • With our response variable Choice there is a small relationship with P_Art, which indicates whether the customer purchased art books in the past. This makes sense, as the marketing material is for an art book.
  • Strong relationship between First_purchase and Last_purchase, also remembering that First_purchase was more normally distributed
  • Somewhat strong relationship between Last_purchase and P_Child, as well as Last_purchase and P_Cook. This model may benefit from removing the Last_purchase variable since it is so correlated with multiple variables
  • I'm confused by the high correlation between the response variable and Observation. This appears to be an id column that denotes individual customers. How does that indicate purchase?

Let's rethink what the variables represent to identify applications for feature engineering:

Name Type Description
Choice int Whether the customer purchased The Art History of Florence. 1 = purchase and 0 = non-purchase
Gender int 0 = Female and 1 = Male
Amount_purchased int Total money spent on BBBC books
Frequency int Total number of purchases in the chosen period
Last_purchase int Months since last purchase
First_purchase int Months since first purchase
P_Child int Number of children’s books purchased
P_Youth int Number of youth’s books purchased
P_Cook int Number of cookbooks purchased
P_DIY int Number of do-it-yourself books purchased
P_Art int Number of art books purchased
  • In the real world I would meet with stakeholders to clarify the Frequency variable and define what it means to be "within the chosen period". Is that within the sampling period, or within a given month?
  • Without greater context any feature engineering is only speculation and unreliable.
    • For example, if we knew what specifically was purchased during that Frequency, that would be more telling.
    • Having a timestamp variable would make things clearer as well. We could then hone in on which months see more purchases and begin building a forecasting model. We do not have that information, so we will keep the variables as they stand.

Handling Imbalanced Data via Resampling

  • Back to Top

    NOTE: I originally modeled without the Observation variable and obtained much lower metrics, like the logistic regression shown below.

TODO: Explain metrics: precision is low across the range of recall due to the class imbalance, so the PR curve takes on an L shape. The ROC curve is not robust to class imbalance and swings wildly, so it is not completely reliable here.
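
The PR-curve behaviour described above can be reproduced directly with the display helper imported at the top; a minimal sketch, assuming a fitted logit_model and the hold-out Xtest/ytest from the cells below:

# precision-recall curve for the purchase class; on heavily imbalanced data it tends toward an L shape
PrecisionRecallDisplay.from_estimator(logit_model, Xtest, ytest, pos_label=1)
plt.show()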

After tracking and comparing runs in MLflow we see there are differences with resampling: the resampled model is much better at capturing true positives, but it also produces many more false positives.

[Figure: MLflow run comparison]

Logit Model Before Sampling

[Figure: confusion matrix, logit model before resampling]

Logit Model After Sampling

[Figure: confusion matrix, logit model after resampling]

As shown in the confusion matrices, the model has gotten better at predicting our minority class after resampling.
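
For reference, either confusion matrix above can be regenerated with the display helper imported at the top; a minimal sketch, again assuming a fitted logit_model and the hold-out split from the next cell:

preds = logit_model.predict(Xtest)
ConfusionMatrixDisplay.from_predictions(ytest, preds)
plt.show()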

In [48]:
# make model
model = LogisticRegression(random_state=7, max_iter=1000)

m1 = myModel('Logistic Regression',model)

# make data
Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
print(ytrain.hist())

# make model
logit_model, logit_scores = m1.fit_model(model, ['f1','accuracy'], tuning=False)

# make predictions
logit_preds = m1.make_preds(logit_model)

# make roc plots
logit_df = m1.make_roc(logit_preds)

# make classification report
logit_rpt = m1.make_report(logit_preds, logit_df)

# make confusion matrix
logit_cm = m1.make_cm(logit_preds, logit_model, logit_rpt)
Axes(0.125,0.11;0.775x0.77)
Average Test Accuracy Score: 0.8833333333333332
In [49]:
# make model
model = LogisticRegression(random_state=7, max_iter=1000)

m1 = myModel('Logistic Regression',model)

# make data
Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "after", _)
print(ytrain.hist())

# make model
logit_model, logit_scores = m1.fit_model(model, ['f1','accuracy'], tuning=False)

# make predictions
logit_preds = m1.make_preds(logit_model)

# make roc plots
logit_df = m1.make_roc(logit_preds)

# make classification report
logit_rpt = m1.make_report(logit_preds, logit_df)

# make confusion matrix
logit_cm = m1.make_cm(logit_preds, logit_model, logit_rpt)
Axes(0.125,0.11;0.775x0.77)
Average Test Accuracy Score: 0.8833333333333332

Data Modeling: No Resample, No Transformations

  • Back to Top

All of my modeling was tracked using MLflow, an open-source tool to easily track model runs, parameters, and metrics as you iterate through the modeling process. The most optimal models were selected based on a combination of the metrics below (a short search_runs sketch follows the list):

  • f1 on test data
  • precision on test minority class
  • accuracy on test data
  • recall on test data
  • number of True Positives and False Positives
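
Once the runs below have been logged, MLflow can rank them by any of these metrics; a minimal sketch, assuming the BBBC_modeling experiment name used throughout (column names follow MLflow's metrics.<name> / tags.mlflow.runName convention):

runs = mlflow.search_runs(experiment_names=["BBBC_modeling"],
                          order_by=["metrics.precision DESC"])
print(runs[["tags.mlflow.runName", "metrics.precision", "metrics.f1", "metrics.accuracy"]].head())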

Logistic Regression

  • Back to Top
In [145]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='F_b_logit'):
    
    model = LogisticRegression(random_state=7, max_iter=1000)

    m1 = myModel('Logistic Regression',model)
    
   
    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
 


    # make model
    logit_model,logit_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)

    
    #signature?
    signature = infer_signature(Xtest, logit_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest            # note: no copy is taken, so the next line also adds a 'label' column to Xtest itself
    eval_data["label"] = ytest
    
    undersamp = 0   # this run uses the un-resampled ("before") data, so no sampling ratios apply
    oversamp = 0
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(logit_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(logit_scores['test_f1']))
    mlflow.log_metric('precision', mean(logit_scores['test_precision']))
    mlflow.log_metric('recall', mean(logit_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    

    # log model
    mlflow.sklearn.log_model(logit_model, "logit model")
    model_uri = mlflow.get_artifact_uri("logit model")
    print("Model Loaded")
    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8833333333333332
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:11:02 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:11:02 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:11:02 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:11:02 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:11:02 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Random Forest

  • Back to Top
In [147]:
# no significant improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_rf_final'):
    model = RandomForestClassifier(random_state=7)

    m1 = myModel('Tuned Random Forest',model)

    # make data
    
    '''
    
    Random forest is an ideal algorithm to deal with the extreme imbalance owing to two main reasons. 
    Firstly, the ability to incorporate class weights into the random forest classifier makes it cost-sensitive; 
    hence it penalizes misclassifying the minority class. 
    
    '''
    
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
 
       
    # make model
    params = {'n_estimators': [25,50,100,200,500],
                 'max_depth': [2,5,8, 50]}
    rf_model, rf_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    
    #signature?
    signature = infer_signature(Xtest, rf_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = 0
    oversamp = 0
 
    
    rf_params = rf_model.get_params()
    
    # # log parameters
    mlflow.log_param('n_estimators', rf_params['n_estimators'])
    mlflow.log_param('max_depth', rf_params['max_depth'])
   
       
    
    # log metrics
    mlflow.log_metric('accuracy', mean(rf_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(rf_scores['test_f1']))
    mlflow.log_metric('precision', mean(rf_scores['test_precision']))
    mlflow.log_metric('recall', mean(rf_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model
    mlflow.sklearn.log_model(rf_model, "log rf model")
    model_uri = mlflow.get_artifact_uri("log rf model")
    print("Model Loaded")
    
    
    explainer = shap.Explainer(rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values
    
    #evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config= {"log_model_explainability":False} # error called when creating beeswarm plot,
    )
    
     # save results in json
    result.save("")
    print("Metrics Saved")




    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8705128205128204
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

Model Loaded
 98%|===================| 1534/1560 [00:21<00:00]        2023/06/13 16:15:27 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:15:27 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:15:27 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Metrics Saved
<Figure size 1050x700 with 0 Axes>

eXtreme Gradient Boost (XGB)

  • Back to Top
In [148]:
# no improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                             eta=.2,
                             max_depth=3,
                             gamma=1,
                             subsample=1)

    m1 = myModel('Tuned XGB Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
 
     

    # make model
    # note: the grid below is defined but tuning=False, so fit_model ignores it and uses the
    # hyperparameters set on the estimator above (eta=.2, max_depth=3, gamma=1, subsample=1)
    params = {'eta' : [0.01, 0.2],
          'max_depth': [3],
          'gamma': [0 , .1],
          'subsample': [0.5, 1]}
    
    tuned_xgb_model, tuned_xgb_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False, params=params)

    
    #signature?
    signature = infer_signature(Xtest, tuned_xgb_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
    xgb_params = tuned_xgb_model.get_params()
    
    # log parameters
    mlflow.log_param('eta', xgb_params['eta'])
    mlflow.log_param('max_depth', xgb_params['max_depth'])
    mlflow.log_param('gamma', xgb_params['gamma'])
    mlflow.log_param('subsample', xgb_params['subsample'])
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(tuned_xgb_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(tuned_xgb_scores['test_f1']))
    mlflow.log_metric('precision', mean(tuned_xgb_scores['test_precision']))
    mlflow.log_metric('recall', mean(tuned_xgb_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(tuned_xgb_model, "xgb model")
    model_uri = mlflow.get_artifact_uri("xgb model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")
    


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:16:02 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:16:02 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:16:02 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:16:02 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:16:03 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved
In [54]:
# catboost_model.get_all_params()
In [55]:
# catboost_model.get_params()

CatBoost

  • Back to Top
In [149]:
# no significant difference after log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_catboost_final'):
    
    
    params = {'iterations':[2,4, 8, 20, 50],
            'depth': [6,7,8,9]}


    model = CatBoostClassifier(bagging_temperature=0,
                               learning_rate=1,
                           loss_function='Logloss',
                           verbose=False,
                           random_state=7)

    m1 = myModel('CatBoost Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    catboost_model, catboost_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    catboost_params = catboost_model.get_params()

    
    #signature?
    signature = infer_signature(Xtest, catboost_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
     # log parameters
    mlflow.log_param('iterations', catboost_params['iterations'])
    mlflow.log_param('learning_rate', catboost_params['learning_rate'])
    mlflow.log_param('depth', catboost_params['depth'])
    mlflow.log_param('bagging_temperature', catboost_params['bagging_temperature'])
    mlflow.log_param('loss_function', catboost_params['loss_function'])
    
    # log metrics
    mlflow.log_metric('accuracy', mean(catboost_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(catboost_scores['test_f1']))
    mlflow.log_metric('precision', mean(catboost_scores['test_precision']))
    mlflow.log_metric('recall', mean(catboost_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
     
      

    # log model
    mlflow.sklearn.log_model(catboost_model, "catboost model")
    model_uri = mlflow.get_artifact_uri("catboost model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")
    


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:17:13 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:17:13 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:17:13 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/06/13 16:17:13 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 16:17:14 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:17:14 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved

Data Modeling: Resampled, Transformed

Logistic Regression

  • Back to Top
In [150]:
# significant improvement after log transformation and resampling

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='a_log_logit_final'):
    
    model = LogisticRegression(random_state=7, max_iter=1000)

    m1 = myModel('Log Logistic Regression',model)
    
   
    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "after", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model
    log_logit_model,log_logit_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    
    #signature?
    signature = infer_signature(Xtest, log_logit_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = .2
    oversamp = .3

   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_logit_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_logit_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_logit_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_logit_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    

    # log model
    mlflow.sklearn.log_model(log_logit_model, "log_logit model after002003")
    model_uri = mlflow.get_artifact_uri("log_logit model after002003")
    print("Model Loaded")
    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8923076923076924
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:21:46 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:21:46 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:21:46 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:21:47 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:21:47 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved
In [ ]:
 

Random Forest

  • Back to Top
In [58]:
# no significant improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_log_rf_final'):
    model = RandomForestClassifier(random_state=7)

    m1 = myModel('Log Random Forest',model)

    # make data
    
    '''
    
    Random forest is an ideal algorithm to deal with the extreme imbalance owing to two main reasons. 
    Firstly, the ability to incorporate class weights into the random forest classifier makes it cost-sensitive; 
    hence it penalizes misclassifying the minority class. 
    
    '''
    
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

       
    # make model
    params = {'n_estimators': [25,50,100,200,500],
                 'max_depth': [2,5,8, 50]}
    log_rf_model, log_rf_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
 
    #signature?
    signature = infer_signature(Xtest, log_rf_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = 0
    oversamp = 0
 
    
    log_rf_params = log_rf_model.get_params()
    
    # # log parameters
    mlflow.log_param('n_estimators', log_rf_params['n_estimators'])
    mlflow.log_param('max_depth', log_rf_params['max_depth'])
   
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_rf_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_rf_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_rf_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_rf_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model
    mlflow.sklearn.log_model(log_rf_model, "log rf model")
    model_uri = mlflow.get_artifact_uri("log rf model")
    print("Model Loaded")
    
    
    explainer = shap.Explainer(log_rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values
    
    #evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config= {"log_model_explainability":False} # error called when creating beeswarm plot,
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")

    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8743589743589745
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
Model Loaded
2023/06/13 14:37:09 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:37:09 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:37:09 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
<Figure size 1050x700 with 0 Axes>

eXtreme Gradient Boost

  • Back to Top
In [152]:
# no improvement with log transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_log_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                             eta=.2,
                             max_depth=3,
                             gamma=1,
                             subsample=1)

    m1 = myModel('Log XGB Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])
  

    # make model
    # note: the grid below is defined but tuning=False, so fit_model ignores it and uses the
    # hyperparameters set on the estimator above
    params = {'eta' : [0.2],
          'max_depth': [3],
          'gamma': [0 , .1],
          'subsample': [1]}
    log_tuned_xgb_model, log_tuned_xgb_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False, params=params)
    
    #signature?
    signature = infer_signature(Xtest, log_tuned_xgb_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
    log_xgb_params = log_tuned_xgb_model.get_params()
    
    # log parameters
    mlflow.log_param('eta', log_xgb_params['eta'])
    mlflow.log_param('max_depth', log_xgb_params['max_depth'])
    mlflow.log_param('gamma', log_xgb_params['gamma'])
    mlflow.log_param('subsample', log_xgb_params['subsample'])
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_tuned_xgb_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_tuned_xgb_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_tuned_xgb_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_tuned_xgb_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(log_tuned_xgb_model, "xgb model")
    model_uri = mlflow.get_artifact_uri("xgb model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:27:44 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:27:44 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:27:44 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:27:44 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:27:45 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved

CatBoost

  • Back to Top
In [60]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_log_catboost_final'):
    
    
    params = {'iterations':[2,4],
            'depth': [6,7,8,9],
             'bagging_temperature': [0, 10, 50]}


    model = CatBoostClassifier(learning_rate=1,
                           loss_function='Logloss',
                           verbose=False,
                           random_state=7)

    m1 = myModel('Log CatBoost Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])
    
    log_catboost_model, log_catboost_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    log_catboost_params = log_catboost_model.get_params()

    
    #signature?
    signature = infer_signature(Xtest, log_catboost_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
     # log parameters
    mlflow.log_param('iterations', log_catboost_params['iterations'])
    mlflow.log_param('learning_rate', log_catboost_params['learning_rate'])
    mlflow.log_param('depth', log_catboost_params['depth'])
    mlflow.log_param('bagging_temperature', log_catboost_params['bagging_temperature'])
    mlflow.log_param('loss_function', log_catboost_params['loss_function'])
    
    # log metrics
    mlflow.log_metric('accuracy', mean(log_catboost_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(log_catboost_scores['test_f1']))
    mlflow.log_metric('precision', mean(log_catboost_scores['test_precision']))
    mlflow.log_metric('recall', mean(log_catboost_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
     
      

    # log model
    mlflow.sklearn.log_model(log_catboost_model, "log_catboost model")
    model_uri = mlflow.get_artifact_uri("log_catboost model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Average Test Accuracy Score: 0.8666666666666666
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:37:19 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:37:19 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:37:19 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/06/13 14:37:19 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 14:37:20 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/13 14:37:20 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Data Modeling: Feature Selection

Only the Logistic Regression model improved significantly after log transformations, so this transformation will not be applied to the other models going forward in the analysis.

Feature Importance: Embedded RF Approach

  • Back to Top
In [61]:
importances = rf_model.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_model.estimators_], axis=0)
feature_names = [i for i in rf_model.feature_names_in_]
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

The following variables will be considered important (a quick selection sketch follows this list):

  • P_Art
  • Amount_purchased
  • Frequency
  • First_purchase
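
As a rough illustration (not the mechanism make_data uses internally), the same slim feature set could be pulled straight from the importance series computed above; the 0.05 cutoff is an assumed threshold, not a tuned value.

# Hypothetical threshold on mean decrease in impurity (0.05 is an assumption)
slim_features = forest_importances[forest_importances > 0.05].index.tolist()
print(slim_features)  # expected to include P_Art, Amount_purchased, Frequency, First_purchase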

Logistic Regression

  • Back to Top
In [151]:
# significant improvement after slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='a_slim_log_logit_final'):
    
    model = LogisticRegression(random_state=7, max_iter=1000)

    m1 = myModel('Slim Log Logistic Regression',model)
    
   
    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "after", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
    
     # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model
    slim_log_logit_model,slim_log_logit_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    #signature?
    signature = infer_signature(Xtest, slim_log_logit_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = .2
    oversamp = .3
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_log_logit_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_log_logit_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_log_logit_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_log_logit_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    

    # log model
    mlflow.sklearn.log_model(slim_log_logit_model, "slim_log_logit model after002003")
    model_uri = mlflow.get_artifact_uri("slim_log_logit model after002003")
    print("Model Loaded")
    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8846153846153847
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:25:54 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:25:54 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:25:54 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:25:54 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning:

Unable to serialize underlying model using MLflow, will use SHAP serialization

2023/06/13 16:25:54 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

/Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning:

No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Metrics Saved

Random Forest

  • Back to Top
In [63]:
# no significant improvement with slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_slim_rf_model_final'):
    model = RandomForestClassifier(random_state=7)

    m1 = myModel('Slim Random Forest',model)

    # make data
    
    '''
    
    Random forest is well suited to imbalanced data largely because class weights can be
    incorporated into the classifier, making it cost-sensitive: misclassifying the
    minority class is penalized more heavily than misclassifying the majority class.
    
    '''
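    # Illustration only (not used in this run): cost-sensitivity could be added by passing
    # class weights to the forest, e.g. RandomForestClassifier(random_state=7, class_weight='balanced'),
    # which up-weights the minority class when fitting.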
    
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "before",  ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
    
       
    # make model
    params = {'n_estimators': [25,50,100,200,500],
                 'max_depth': [2,5,8, 50]}
    slim_rf_model, slim_rf_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    
    #signature?
    signature = infer_signature(Xtest, slim_rf_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    
    undersamp = 0
    oversamp = 0
 
    
    slim_rf_params = slim_rf_model.get_params()
    
    # # log parameters
    mlflow.log_param('n_estimators', slim_rf_params['n_estimators'])
    mlflow.log_param('max_depth', slim_rf_params['max_depth'])
   
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_rf_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_rf_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_rf_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_rf_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model
    mlflow.sklearn.log_model(slim_rf_model, "slim_log rf model")
    model_uri = mlflow.get_artifact_uri("slim_log rf model")
    print("Model Loaded")
    
    
    explainer = shap.Explainer(slim_rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values
    
    #evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config= {"log_model_explainability":False} # explainability disabled: an error is raised when creating the beeswarm plot
    )
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8705128205128204
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
Model Loaded
 99%|===================| 1538/1560 [00:18<00:00]        2023/06/13 14:38:56 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:38:56 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:38:56 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
<Figure size 1050x700 with 0 Axes>

Extreme Gradient Boost

  • Back to Top
In [64]:
# worse with slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_slim_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                             eta=.2,
                             max_depth=3,
                             gamma=1,
                             subsample=1)

    m1 = myModel('Slim XGB Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "before", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
     

    # make model
    params = {'eta' : [0.01, 0.2],
          'max_depth': [3],
          'gamma': [0 , .1],
          'subsample': [0.5, 1]}
    
    slim_tuned_xgb_model, slim_tuned_xgb_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False, params=params)

    #signature?
    signature = infer_signature(Xtest, slim_tuned_xgb_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
    slim_xgb_params = slim_tuned_xgb_model.get_params()
    
    # log parameters
    mlflow.log_param('eta', slim_xgb_params['eta'])
    mlflow.log_param('max_depth', slim_xgb_params['max_depth'])
    mlflow.log_param('gamma', slim_xgb_params['gamma'])
    mlflow.log_param('subsample', slim_xgb_params['subsample'])
   
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_tuned_xgb_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_tuned_xgb_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_tuned_xgb_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_tuned_xgb_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(slim_tuned_xgb_model, "slim_xgb model")
    model_uri = mlflow.get_artifact_uri("slim_xgb model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8807692307692306
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:38:59 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:38:59 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:38:59 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 14:39:00 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/13 14:39:00 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

CatBoost

  • Back to Top
In [65]:
# worse with slim transformation

mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='b_slim_catboost_final'):
    
    
    params = {'iterations':[2,4, 8, 20, 50],
            'depth': [6,7,8,9]}


    model = CatBoostClassifier(bagging_temperature=0,
                               learning_rate=1,
                           loss_function='Logloss',
                           verbose=False,
                           random_state=7)

    m1 = myModel('Slim CatBoost Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "before", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")
    
    slim_catboost_model, slim_catboost_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=True, params=params)
    
    slim_catboost_params = slim_catboost_model.get_params()
   
    #signature?
    signature = infer_signature(Xtest, slim_catboost_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    
    
     # log parameters
    mlflow.log_param('iterations', slim_catboost_params['iterations'])
    mlflow.log_param('learning_rate', slim_catboost_params['learning_rate'])
    mlflow.log_param('depth', slim_catboost_params['depth'])
    mlflow.log_param('bagging_temperature', slim_catboost_params['bagging_temperature'])
    mlflow.log_param('loss_function', slim_catboost_params['loss_function'])
    
    # log metrics
    mlflow.log_metric('accuracy', mean(slim_catboost_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(slim_catboost_scores['test_f1']))
    mlflow.log_metric('precision', mean(slim_catboost_scores['test_precision']))
    mlflow.log_metric('recall', mean(slim_catboost_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
     
      

    # log model
    mlflow.sklearn.log_model(slim_catboost_model, "slim_catboost model")
    model_uri = mlflow.get_artifact_uri("slim_catboost model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")

    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.873076923076923
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:39:11 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:39:11 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:39:11 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
2023/06/13 14:39:11 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 14:39:12 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
2023/06/13 14:39:12 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback.
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored

Ensembling

  • Back to Top
In [153]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv1_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v1: logistic regression, tuned random forest, tuned XGB, and tuned CatBoost
    estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model)]


    model = VotingClassifier(estimators, voting='hard')
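    # Note: voting='hard' takes a majority vote over the base estimators' predicted class
    # labels (as opposed to voting='soft', which would average predicted probabilities).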

    m1 = myModel('EnsV1 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v1_model, ensemble_v1_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)

    
    #signature?
    signature = infer_signature(Xtest, ensemble_v1_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v1_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v1_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v1_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v1_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v1_model, "ens1 model")
    model_uri = mlflow.get_artifact_uri("ens1 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
/Users/coolkid/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.

Average Test Accuracy Score: 0.8756410256410255
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:30:05 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:30:05 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:30:05 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:30:05 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample_bytree=None,\n                                            early_stopping_rounds=None,\n                                            enable_categorical=Fal...\n                                            interaction_constraints=None,\n                                            learning_rate=None, max_bin=None,\n                                            max_cat_threshold=None,\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...)),\n                             ('tuned_sample_catboost',\n                              <catboost.core.CatBoostClassifier object at 0x17a8ae560>)])"). Set logging level to DEBUG to see the full traceback.
Metrics Saved
<Figure size 1050x700 with 0 Axes>
In [67]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv2_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v2: ensemble v1 estimators plus the log-transformed logistic regression
    estimators=[('log logit', log_logit_model),('logit', logit_model), ('rf_tuned', rf_model), 
                ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model)]


    model = VotingClassifier(estimators, voting='hard')

    m1 = myModel('EnsV2 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v2_model, ensemble_v2_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)

    
    #signature?
    signature = infer_signature(Xtest, ensemble_v2_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost',
             'modl5': 'log_logit'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v2_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v2_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v2_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v2_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v2_model, "ens2 model")
    model_uri = mlflow.get_artifact_uri("ens2 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )


    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8756410256410255
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
2023/06/13 14:39:23 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 14:39:23 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 14:39:23 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 14:39:23 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample...\n                                            interaction_constraints=None,\n                                            learning_rate=None, max_bin=None,\n                                            max_cat_threshold=None,\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...)),\n                             ('tuned_sample_catboost',\n                              <catboost.core.CatBoostClassifier object at 0x285186b60>)])"). Set logging level to DEBUG to see the full traceback.
<Figure size 1050x700 with 0 Axes>
In [113]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv3_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v3: ensemble v2 estimators plus the slim log logistic regression
    estimators=[('log logit', log_logit_model),('logit', logit_model), ('rf_tuned', rf_model), 
                ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model), 
               ('slim log logit', slim_log_logit_model)]


    model = VotingClassifier(estimators, voting='hard')

    m1 = myModel('EnsV3 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v3_model, ensemble_v3_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    #signature?
    signature = infer_signature(Xtest, ensemble_v3_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost',
             'modl5': 'log_logit',
             'modl6': 'slim log logit'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v3_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v3_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v3_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v3_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v3_model, "ens3 model")
    model_uri = mlflow.get_artifact_uri("ens3 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    

    
mlflow.end_run()
Data Loaded
Average Test Accuracy Score: 0.8794871794871796
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 15:33:16 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 15:33:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 15:33:16 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 15:33:16 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample...\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...)),\n                             ('tuned_sample_catboost',\n                              <catboost.core.CatBoostClassifier object at 0x28698aa40>),\n                             ('slim log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7))])"). Set logging level to DEBUG to see the full traceback.
<Figure size 1050x700 with 0 Axes>
In [154]:
mlflow.set_tracking_uri("")

mlflow.set_experiment(experiment_name="BBBC_modeling")


with mlflow.start_run(run_name='ensv4_final'):
    
    
    # earlier three-model estimator list (kept for reference)
    # estimators=[('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model)]
    
    # ensemble v4: ensemble v3 estimators plus the log-transformed XGB
    estimators=[('log logit', log_logit_model),('logit', logit_model), ('rf_tuned', rf_model), 
                ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model), 
               ('slim log logit', slim_log_logit_model), ('log_xgb', log_tuned_xgb_model)]


    model = VotingClassifier(estimators, voting='hard')

    m1 = myModel('EnsV4 Model',model)

    # make data
    Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")
    
    ensemble_v4_model, ensemble_v4_scores = m1.fit_model(model, ['f1','accuracy', 'precision', 'recall'], tuning=False)
    
    #signature?
    signature = infer_signature(Xtest, ensemble_v4_model.predict(Xtest))
    
    #Build the Evaluation Dataset from the test set
    eval_data = Xtest
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    models = {'modl1': 'logit',
             'modl2': 'rf_tuned',
             'modl3': 'xgb',
             'modl4': 'tuned_sample_catboost',
             'modl5': 'log_logit',
             'modl6': 'slim log logit',
             'modl7': 'log_xgb'}
    
    # log params
    mlflow.log_params(models)
    
    
    # log metrics
    mlflow.log_metric('accuracy', mean(ensemble_v4_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v4_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v4_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v4_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model
    mlflow.sklearn.log_model(ensemble_v4_model, "ens4 model")
    model_uri = mlflow.get_artifact_uri("ens4 model")
    print("Model Loaded")

    
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    
    # save results in json
    result.save("")
    print("Metrics Saved")


    
mlflow.end_run()
Data Loaded
Building Model...
Average Test Accuracy Score: 0.8756410256410255
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning:

Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.

2023/06/13 16:32:35 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment:
 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2023/06/13 16:32:35 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/06/13 16:32:35 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:32:36 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('logit',\n                              LogisticRegression(max_iter=1000,\n                                                 random_state=7)),\n                             ('rf_tuned',\n                              RandomForestClassifier(max_depth=8,\n                                                     n_estimators=200,\n                                                     random_state=7)),\n                             ('xgb',\n                              XGBClassifier(base_score=None, booster=None,\n                                            callbacks=None,\n                                            colsample_bylevel=None,\n                                            colsample_bynode=None,\n                                            colsample...\n                                            feature_types=None, gamma=1,\n                                            gpu_id=None, grow_policy=None,\n                                            importance_type=None,\n                                            interaction_constraints=None,\n                                            learning_rate=None, max_bin=None,\n                                            max_cat_threshold=None,\n                                            max_cat_to_onehot=None,\n                                            max_delta_step=None, max_depth=3,\n                                            max_leaves=None,\n                                            min_child_weight=None, missing=nan,\n                                            monotone_constraints=None,\n                                            n_estimators=100, n_jobs=None,\n                                            num_parallel_tree=None,\n                                            predictor=None, ...))])"). Set logging level to DEBUG to see the full traceback.
Metrics Saved
<Figure size 1050x700 with 0 Axes>
In [165]:
# Loading json files from mlflow evaluation
logit_results = json_to_df("logit_metrics.json", "Logistic Regression")
log_logit_results = json_to_df("log_logit_metrics.json", "Log Logistic Regression")
slim_log_logit_results = json_to_df("slim_log_logit_metrics.json", "Slim Log Logistic Regression")
rf_results = json_to_df("rf_metrics.json", "Random Forest")
tuned_xgb_results = json_to_df("xgb_metrics.json", "XGB")
catboost_results = json_to_df("catboost_metrics.json", "CatBoost")
log_tuned_xgb_results = json_to_df("log_xgb_metrics.json", "Log XGB")
ensv1_results = json_to_df("ensv1_metrics.json", "EnsV1")
ensv4_results = json_to_df("ensv4_metrics.json", "EnsV4")
In [ ]:
 

Results ¶

  • Back to Top
In [310]:
# TODO: concatenate all data frames together
'''
visualize:
    1. Precision on 1 plus others (recall/f1?) for each model
    2. ROC curve for each model
    3. Test Accuracy for each model
    4. preds for 1 for each model
'''

data_df = pd.concat([logit_results,log_logit_results,slim_log_logit_results,
                    rf_results,tuned_xgb_results,catboost_results,log_tuned_xgb_results,
                    ensv1_results, ensv4_results])


data_df.drop_duplicates(inplace=True)
In [311]:
data_df
Out[311]:
score true_negatives false_positives false_negatives true_positives example_count accuracy_score recall_score precision_score f1_score log_loss roc_auc precision_recall_auc Model
1 0.865385 638 39 66 37 780 0.865385 0.359223 0.486842 0.413408 0.341163 0.782292 0.439211 Logistic Regression
1 0.873077 644 33 66 37 780 0.873077 0.359223 0.528571 0.427746 0.337587 0.785131 0.440876 Log Logistic Regression
1 0.871795 652 25 75 28 780 0.871795 0.271845 0.528302 0.358974 0.365266 0.735842 0.400491 Slim Log Logistic Regression
1 0.879487 665 12 82 21 780 0.879487 0.203883 0.636364 0.308824 0.335726 0.751846 0.416268 Random Forest
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 XGB
1 NaN 656 21 81 22 780 0.869231 0.213592 0.511628 0.301370 0.346526 0.734128 0.379118 CatBoost
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 Log XGB
1 0.888462 670 7 80 23 780 0.888462 0.223301 0.766667 0.345865 NaN NaN NaN EnsV1
1 0.891026 667 10 75 28 780 0.891026 0.271845 0.736842 0.397163 NaN NaN NaN EnsV4

Best Metrics from Each Model Type ¶

  • Back to Top
In [313]:
# selecting and sorting metrics for easier comparison
viz_df = data_df

viz_df = viz_df[['Model', 'accuracy_score', 'precision_score', 'f1_score', 'recall_score']]
viz_df = viz_df.sort_values('accuracy_score', ascending=False)

viz_df.plot(x="Model",
           kind="bar",
           stacked=False,
           title = "Top Model Metric Comparison")
Out[313]:
<Axes: title={'center': 'Top Model Metric Comparison'}, xlabel='Model'>

Apologies for the legend being in the way! I originally plotted this with Plotly, but the chart did not render in HTML and I wanted to post this today!

Greater detail about the metrics can be found below. The purpose of this plot is to highlight the high accuracy achieved by our ensembling approaches and the performance improvement after log transforming and applying feature selection to our baseline model (Logistic Regression).

In [268]:
get_profit(data_df)
/var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:185: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:186: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:187: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[268]:
score true_negatives false_positives false_negatives true_positives example_count accuracy_score recall_score precision_score f1_score log_loss roc_auc precision_recall_auc Model Profit No Model Profit Profit Boost From Model
1 0.865385 638 39 66 37 780 0.865385 0.359223 0.486842 0.413408 0.341163 0.782292 0.439211 Logistic Regression 877.7 373.65 134.898970
1 0.873077 644 33 66 37 780 0.873077 0.359223 0.528571 0.427746 0.337587 0.785131 0.440876 Log Logistic Regression 883.7 373.65 136.504750
1 0.871795 652 25 75 28 780 0.871795 0.271845 0.528302 0.358974 0.365266 0.735842 0.400491 Slim Log Logistic Regression 891.7 373.65 138.645792
1 0.879487 665 12 82 21 780 0.879487 0.203883 0.636364 0.308824 0.335726 0.751846 0.416268 Random Forest 904.7 373.65 142.124983
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 XGB 896.7 373.65 139.983942
1 NaN 656 21 81 22 780 0.869231 0.213592 0.511628 0.301370 0.346526 0.734128 0.379118 CatBoost 895.7 373.65 139.716312
1 0.871795 657 20 80 23 780 0.871795 0.223301 0.534884 0.315068 0.337559 0.756055 0.402883 Log XGB 896.7 373.65 139.983942
1 0.888462 670 7 80 23 780 0.888462 0.223301 0.766667 0.345865 NaN NaN NaN EnsV1 909.7 373.65 143.463134
1 0.891026 667 10 75 28 780 0.891026 0.271845 0.736842 0.397163 NaN NaN NaN EnsV4 906.7 373.65 142.660244
In [ ]:
# ensv1 has greater precision on minority class

Findings ¶

  • Back to Top

We stand to make a substantial profit from implementing machine learning in our direct marketing campaigns. As opposed to earning $373.65 with no model, the company can earn $909.70 using our optimal model! That is roughly a 143% profit boost (see the quick check below) and would bring even greater margins across the company's 50K subscribers!
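
A minimal sanity check of that figure, using the profit values reported by get_profit above (the dollar amounts are copied from that table):

# Profit with the top ensemble vs. no model, from the get_profit output above
model_profit, no_model_profit = 909.70, 373.65
boost_pct = (model_profit - no_model_profit) / no_model_profit * 100
print(round(boost_pct, 1))  # ~143.5% increase over the no-model baseline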

The optimal model was the Version 1 ensembling model. It achieved an accuracy score of about 88% and a precision of about 76%. The model with the highest accuracy was our Version 4 ensembling model, but it had lower precision, which is an example of why each metric should be understood. Precision measures how many of the model's positive predictions are actually positive (true positives divided by true plus false positives), so it is the right metric when false positives are costly. We wanted a model that best predicts Purchase with the lowest number of false positives; our EnsV1 model achieves this (see the quick check below) and would be my recommendation for the company.
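
As a quick check, the EnsV1 precision can be recomputed from the confusion-matrix counts in the results table above (counts copied from the EnsV1 row):

# Precision = TP / (TP + FP)
tp, fp = 23, 7
precision = tp / (tp + fp)
print(round(precision, 3))  # 0.767, the ~76% precision cited for EnsV1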

Feature Importance ¶

(Embedded image: feature importance chart.)

The most important features were somewhat obvious when you think of the business use case. Focus on Customers who:

  • have purchased an Art book in the past
  • have a large total amount purchased
  • have a high number of purchases in a given period

Observing the summary output of a Logistic Regression model is useful when there are statistically significant features, since it shows which direction each feature pushes the odds of Purchase. In our slim model, one feature could be considered statistically significant after log transforming and resampling the data. This could be experimented with and tested further, but based on the logistic model's odds ratios the company should consider customers who (a short odds-ratio sketch follows this list):

  • have made their first purchase recently. Odds of purchase decrease for every one-unit increase in First_purchase (on the log scale, since this variable was log transformed)
  • have made a large number of art book purchases in the past
  • have not made many purchases in the chosen period. Odds of purchase decrease by a factor of about 0.5 for every one-unit increase in Frequency (again on the log scale)
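
A minimal sketch of the odds-ratio reading used above (coefficient values are copied from the Logit summary printed below; Frequency and First_purchase are on the natural-log scale):

import numpy as np

coefs = {'P_Art': 0.9561, 'Amount_purchased': 0.0020, 'Frequency': -0.6888, 'First_purchase': -0.2104}
for name, b in coefs.items():
    odds_ratio = np.exp(b)               # OR = e^coef
    pct_change = (odds_ratio - 1) * 100  # percent change in the odds per one-unit increase
    print(f"{name}: OR={odds_ratio:.2f}, odds change per unit increase ~ {pct_change:+.0f}%")
# e.g. Frequency: OR ~ 0.50, so the odds of purchase roughly halve per unit increase in log(Frequency)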
In [279]:
#No Values Were Significant so did not include in the analysis
# make data
Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "after", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
# Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)

# log transform data
Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

print_model = sm.Logit(ytrain,Xtrain).fit()
print(print_model.summary())

# convert log odds into more interpretable odds ratios
model_odds = pd.DataFrame(np.exp(print_model.params), columns=['OR'])
model_odds
Optimization terminated successfully.
         Current function value: 0.466430
         Iterations 6
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                 Choice   No. Observations:                 3256
Model:                          Logit   Df Residuals:                     3252
Method:                           MLE   Df Model:                            3
Date:                Tue, 13 Jun 2023   Pseudo R-squ.:                  0.1363
Time:                        18:43:07   Log-Likelihood:                -1518.7
converged:                       True   LL-Null:                       -1758.4
Covariance Type:            nonrobust   LLR p-value:                1.327e-103
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
P_Art                0.9561      0.069     13.833      0.000       0.821       1.092
Amount_purchased     0.0020      0.000      4.155      0.000       0.001       0.003
Frequency           -0.6888      0.075     -9.168      0.000      -0.836      -0.542
First_purchase      -0.2104      0.073     -2.877      0.004      -0.354      -0.067
====================================================================================
Out[279]:
OR
P_Art 2.601529
Amount_purchased 1.001957
Frequency 0.502203
First_purchase 0.810282

Now let's use this optimal model to create a Marketing List that gives our company an actionable item they can use!

Marketing List ¶

  • Back to Top

Now let's select some observations that are likely to purchase our material. This could be used to generate leads for marketers in the company.

In [281]:
Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
preds = ensemble_v4_model.predict(Xtest)
In [282]:
df_preds = pd.DataFrame(ytest)
df_preds['ensv4']= preds
df_preds.reset_index(inplace=True)
df_preds.rename(columns={'index':'CustomerId'}, inplace=True) # if we were given customer IDs in a real-world situation, this would be the index.
df_preds
Out[282]:
CustomerId Choice ensv4
0 284 1 0
1 244 1 0
2 1134 0 0
3 440 0 0
4 820 0 0
... ... ... ...
775 1179 0 0
776 577 0 0
777 1309 0 0
778 760 0 0
779 1277 0 0

780 rows × 3 columns

In [286]:
test = df_preds[df_preds['Choice']==1]
test_final = test[test['ensv4']==1]
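# Note: filtering on Choice == 1 here uses the known test labels to verify which actual buyers
# the ensemble also flagged; in production only the ensv4 == 1 predictions would be available.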
test_final
Out[286]:
CustomerId Choice ensv4
39 110 1 1
51 157 1 1
131 141 1 1
150 95 1 1
154 194 1 1
237 186 1 1
256 353 1 1
280 76 1 1
316 200 1 1
347 234 1 1
377 29 1 1
401 254 1 1
409 125 1 1
426 395 1 1
438 96 1 1
446 151 1 1
469 335 1 1
490 118 1 1
506 227 1 1
546 365 1 1
548 193 1 1
554 123 1 1
564 178 1 1
596 343 1 1
597 237 1 1
710 38 1 1
742 33 1 1
750 99 1 1

Now we have 28 customers we can reach out to with a high likelihood that they will purchase. This was an interesting topic to experiment with, and I look forward to using this approach again in the future!