import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import json
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
import statsmodels.api as sm
import mlflow
import mlflow.sklearn
from statistics import mean
from mlflow.models.signature import infer_signature
import shap
from catboost import CatBoostClassifier, Pool
class myModel:
    """Pair a named estimator with the BBBC data-preparation and fitting helpers.

    Attributes:
        name: Label for the model, as given to the constructor.
        model: The estimator object given to the constructor.
    """

    def __init__(self, name, model):
        self.name = name
        self.model = model

    @staticmethod
    def _resolve_split(label, value):
        """Return ``value``, or the module-level variable named ``label`` when ``value`` is None.

        The original helpers read ``Xtrain``/``Xtest``/``ytrain``/``ytest`` straight from
        module globals; this keeps that behaviour as a fallback while letting callers
        pass the data explicitly.

        Raises:
            NameError: If no value was passed and no such module-level variable exists.
        """
        if value is not None:
            return value
        try:
            return globals()[label]
        except KeyError:
            raise NameError(
                f"name '{label}' is not defined; pass it explicitly or define it at module level"
            ) from None

    @staticmethod
    def _resample(Xtrain, ytrain):
        """Random-undersample then SMOTE-oversample the training split.

        The sampling ratios (0.2 for the undersampler, 0.3 for SMOTE) are the ones the
        original code used. Both samplers are seeded with 7, like every other random
        component in this file, so repeated runs produce the same resampled data.

        NOTE(review): the original comment said "want about 1500 in minority class and
        2000 majority"; a 0.3 ratio does not correspond to those counts - confirm the
        intended ratios.
        """
        under = RandomUnderSampler(sampling_strategy=.2, random_state=7)
        over = SMOTE(sampling_strategy=.3, random_state=7)
        pipeline = Pipeline(steps=[('u', under), ('o', over)])
        return pipeline.fit_resample(Xtrain, ytrain)

    def log_transform(self, columns: list, Xtrain=None, Xtest=None):
        """Log transform the given columns of the train and test frames in place.

        Args:
            columns: A list of strings representing variable names
            Xtrain: Training frame; defaults to the module-level ``Xtrain``
            Xtest: Test frame; defaults to the module-level ``Xtest``
        Returns:
            The same train & test dataframes, with ``np.log`` applied to ``columns``
        Raises:
            NameError: If a frame is omitted and no module-level variable of that name exists
        """
        Xtrain = self._resolve_split('Xtrain', Xtrain)
        Xtest = self._resolve_split('Xtest', Xtest)
        for col in columns:
            Xtrain[col] = np.log(Xtrain[col])
            Xtest[col] = np.log(Xtest[col])
        return Xtrain, Xtest

    def make_data(self, str1: str, str2: str, str3: list):
        """Load the BBBC workbooks and return an 80/20 hold-out split.

        Reads 'BBBC-Train.xlsx' and 'BBBC-Test.xlsx' from the working directory,
        concatenates them, drops the 'Observation' column and splits with
        ``random_state=7``.

        Args:
            str1: 'full' to use every column except 'Choice' as predictors, or 'slim'
                to use only the columns listed in ``str3``
            str2: 'before' to return the split untouched, or 'after' to resample the
                training split (undersampling followed by SMOTE)
            str3: A list of column names to select when ``str1`` is 'slim'; ignored
                for 'full'
        Returns:
            Xtrain, Xtest, ytrain, ytest
        Raises:
            ValueError: If ``str1`` or ``str2`` is not one of the accepted values
        """
        # Validate before touching the disk; the original printed 'Error' (or nothing)
        # and returned None, which only surfaced later as an unpacking failure.
        if str1 not in ('full', 'slim'):
            raise ValueError(f"str1 must be 'full' or 'slim', got {str1!r}")
        if str2 not in ('before', 'after'):
            raise ValueError(f"str2 must be 'before' or 'after', got {str2!r}")

        train = pd.read_excel('BBBC-Train.xlsx')
        test = pd.read_excel('BBBC-Test.xlsx')
        data = pd.concat([test, train])
        # keeping this column creates highly accurate results about 97%
        # seems like overfitting...
        data.drop('Observation', axis=1, inplace=True)

        X = data.drop(['Choice'], axis=1) if str1 == 'full' else data[str3]
        y = data['Choice']
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=7)

        if str2 == 'after':
            # Only the training split is resampled; the test split stays as drawn.
            Xtrain, ytrain = self._resample(Xtrain, ytrain)
        return Xtrain, Xtest, ytrain, ytest

    def fit_model(self, model, scoring: list, tuning=False, params=None,
                  Xtrain=None, ytrain=None, Xtest=None, ytest=None):
        """Fit ``model`` on the training split and cross-validate on the test split.

        Args:
            model: An estimator exposing ``fit`` (passed to scikit-learn helpers)
            scoring: A list of metric names handed to ``cross_validate``
            tuning: Whether to run a 5-fold ``GridSearchCV`` over ``params`` first
            params: Parameter grid for ``GridSearchCV``; required when ``tuning`` is true
            Xtrain, ytrain, Xtest, ytest: The data splits; each defaults to the
                module-level variable of the same name
        Returns:
            1. The fitted model (the grid search's ``best_estimator_`` when tuning)
            2. The dict returned by ``cross_validate`` (one array of scores per metric)
        Raises:
            ValueError: If ``tuning`` is true and ``params`` is not provided
            NameError: If a split is omitted and no module-level variable of that name exists
        """
        if tuning and params is None:
            raise ValueError("params must be provided when tuning=True")

        Xtrain = self._resolve_split('Xtrain', Xtrain)
        ytrain = self._resolve_split('ytrain', ytrain)
        Xtest = self._resolve_split('Xtest', Xtest)
        ytest = self._resolve_split('ytest', ytest)

        if not tuning:
            fitted = model.fit(Xtrain, ytrain)
            scores = cross_validate(model, Xtest, ytest, scoring=scoring)
            return fitted, scores

        search = GridSearchCV(model, params, cv=5)
        search.fit(Xtrain, ytrain)
        # NOTE(review): this cross-validates the whole grid search on the test split
        # (as the original did), i.e. the grid is searched again inside every fold.
        # Confirm whether scoring search.best_estimator_ was intended instead.
        scores = cross_validate(search, Xtest, ytest, scoring=scoring)
        return search.best_estimator_, scores
def get_profit(df, mailing_cost=0.65, manuf_cost=15, overhead_rate=.45,
               selling_price=31.95, n_cust=103, total_cust=780):
    """Add profit columns to a frame of confusion-matrix counts.

    The columns are computed for all rows at once and assigned with ``df[col] = ...``.
    The original wrote row by row through chained indexing (``df[col][i] = value``),
    which pandas does not guarantee will reach ``df`` and which is a no-op under
    Copy-on-Write.

    Args:
        df: DataFrame with 'false_negatives', 'true_positives' and 'false_positives'
            columns. It is modified in place.
        mailing_cost: Mailing cost per customer.
        manuf_cost: Manufacturing cost per book.
        overhead_rate: Overhead as a fraction of ``manuf_cost``.
        selling_price: Selling price per book.
        n_cust: Customers who purchased in the test set.
        total_cust: Total customers in the test set.
    Returns:
        The same ``df`` with 'Profit', 'No Model Profit' and
        'Profit Boost From Model' (percent) columns added.
    """
    # profit = revenue - cost, per book
    overhead_cost = manuf_cost * overhead_rate
    prof_book = selling_price - mailing_cost - manuf_cost - overhead_cost

    # NOTE(review): the original named this sum "total_mailed" although it adds false
    # negatives to true positives; the formula is preserved as written - confirm
    # whether true positives + false positives was intended for the mailing cost.
    buyers = df['false_negatives'] + df['true_positives']

    # Each false positive is penalised by 1, then mailing expenses are subtracted.
    profit = (buyers * prof_book - df['false_positives']) - buyers * mailing_cost
    # Baseline: mail every customer in the test set, with the same per-sale penalty.
    no_model_profit = (n_cust * prof_book - n_cust) - total_cust * mailing_cost

    df['Profit'] = profit
    df['No Model Profit'] = no_model_profit
    df['Profit Boost From Model'] = ((profit - no_model_profit) / no_model_profit) * 100
    return df
def calc_metrics2(models):
    """Show a grouped bar chart comparing the score columns of each model.

    Args:
        models: DataFrame with a 'Model' column plus 'accuracy_score',
            'precision_score', 'f1_score' and 'recall_score' columns.
    """
    metric_cols = ['accuracy_score', 'precision_score', 'f1_score', 'recall_score']
    # pd.melt returns a new long-form frame with 'variable' and 'value' columns; the
    # original discarded it and plotted the wide frame, which has neither column.
    long_form = pd.melt(models[['Model'] + metric_cols],
                        id_vars=['Model'], value_vars=metric_cols)
    # NOTE(review): `px` is presumably plotly.express; no import is visible in this
    # file - confirm it is imported before this function is called.
    fig = px.bar(long_form, x="Model", y="value",
                 color="variable", barmode="group")
    fig.show()
def json_to_df(json_file, name):
    """Load a JSON file into a DataFrame and tag every row with a model name.

    Args:
        json_file: Path of the JSON file to read.
        name: Value written to the added 'Model' column.
    Returns:
        A DataFrame built from the parsed JSON, indexed 1..len(data)-1, with a
        'Model' column set to ``name``.
    """
    # JSON text is UTF-8; don't depend on the platform's default encoding.
    with open(json_file, encoding="utf-8") as file:
        data = json.load(file)
    # NOTE(review): the index has len(data) - 1 entries (one fewer than the number of
    # top-level keys); kept exactly as the original - confirm this is intended.
    data = pd.DataFrame(data, index=list(range(1, len(data))))
    data['Model'] = name
    return data
In 1994, about 50,000 new titles were published in the US each year, giving rise to a $20B book publishing industry. About 10% of these books are sold through mail order. Recently, online superstores such as Amazon have emerged, carrying 1-2.5M titles and further intensifying the pressure on book clubs and mail order firms. In response to these pressures, book clubs are starting to look at alternative business models that will make them more responsive to their customer’s preferences.
The BBBC, Bookbinders Book Club, was established in 1986 for the purpose of selling specialty books through direct marketing. BBBC is strictly a distributor and does not publish any of the books it sells. In anticipation of using database marketing, BBBC made a strategic decision right from the start to build and maintain a detailed database about its members containing all relevant information about them. Readers fill out an insert and return it to BBBC which then enters the data into the database.
BBBC is exploring whether to use predictive modeling approaches to improve the efficacy of its direct mail program. For this analysis, we will use a subset of the database available to BBBC. The dependent variable (i.e., response group) for the analysis is Choice
– purchase or no purchase of the book. BBBC also selected several independent variables that it thought might explain the observed choice behavior. The variables are:
Name | Type | Description |
---|---|---|
Choice | int | If the customer purchased The Art History of Florence. 1 = purchase and 0 = non-purchase |
Gender | int | 0 = Female and 1 = Male |
Amount_purchased | int | Total money spent on BBBC books |
Frequency | int | Total number of purchases in the chosen period |
Last_purchase | int | Months since last purchase |
First_purchase | int | Months since first purchase |
P_Child | int | Number of children’s books purchased |
P_Youth | int | Number of youth’s books purchased |
P_Cook | int | Number of cookbooks purchased |
P_DIY | int | Number of do-it-yourself books purchased |
P_Art | int | Number of art books purchased |
Problem Type:
This is a supervised classification problem using structured data, since we are predicting a binary response: 0 for no purchase or 1 for purchase, respectively.
Business Goals/Requirements:
To avoid data leakage, I perform hold-out cross-validation (a train/test split) before exploring the data:
# Split first so the hold-out rows never inform any exploration decision.
model = LogisticRegression(random_state=7, max_iter=1000)
m1 = myModel('EDA', model)
# make data
# "full" keeps every predictor and "before" skips resampling. The third argument is
# only read for "slim", so pass None instead of `_`, which exists only in IPython.
x_train, x_test, y_train, y_test = m1.make_data("full", "before", None)
x_train.head()
Gender | Amount_purchased | Frequency | Last_purchase | First_purchase | P_Child | P_Youth | P_Cook | P_DIY | P_Art | |
---|---|---|---|---|---|---|---|---|---|---|
576 | 1 | 210 | 12 | 1 | 12 | 0 | 0 | 0 | 1 | 0 |
1522 | 0 | 34 | 4 | 1 | 4 | 0 | 0 | 1 | 0 | 0 |
465 | 1 | 303 | 2 | 2 | 6 | 0 | 0 | 1 | 0 | 0 |
488 | 1 | 266 | 34 | 2 | 40 | 0 | 0 | 0 | 0 | 1 |
451 | 0 | 340 | 14 | 4 | 26 | 3 | 0 | 1 | 0 | 0 |
# how many observations will we be training with?
# .shape is (rows, columns) of the training predictors
x_train.shape
(3120, 10)
# is response variable balanced?
# Top panel: training labels; bottom panel: test labels.
fig, (train_ax, test_ax) = plt.subplots(2)
fig.suptitle('Observing Imbalanced Data')
train_ax.hist(y_train)
test_ax.hist(y_test)
(array([677., 0., 0., 0., 0., 0., 0., 0., 0., 103.]), array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), <BarContainer object of 10 artists>)
The response variable is highly imbalanced. Leaving the data in its current state could lead to a model that is a poor predictor for the minority class 1,
which is our focus in this modeling process. One of our goals is to best predict customers who will purchase, and a model that predicts otherwise will be ineffective. There are many techniques to overcome this; I will attempt 2 and observe the results:
Modeling Approaches: This is a supervised classification problem so a few modeling approaches come to mind:
Let's start exploring the basics of the data...
# what kind of data types in train and test
# DataFrame.info() prints its own report and returns None, so wrapping it in print()
# appends a stray "None" line after each report.
print("Training:")
x_train.info()
print()
print("Testing:")
x_test.info()
Training: <class 'pandas.core.frame.DataFrame'> Int64Index: 3120 entries, 576 to 175 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 3120 non-null int64 1 Amount_purchased 3120 non-null int64 2 Frequency 3120 non-null int64 3 Last_purchase 3120 non-null int64 4 First_purchase 3120 non-null int64 5 P_Child 3120 non-null int64 6 P_Youth 3120 non-null int64 7 P_Cook 3120 non-null int64 8 P_DIY 3120 non-null int64 9 P_Art 3120 non-null int64 dtypes: int64(10) memory usage: 268.1 KB None Testing: <class 'pandas.core.frame.DataFrame'> Int64Index: 780 entries, 284 to 1277 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 780 non-null int64 1 Amount_purchased 780 non-null int64 2 Frequency 780 non-null int64 3 Last_purchase 780 non-null int64 4 First_purchase 780 non-null int64 5 P_Child 780 non-null int64 6 P_Youth 780 non-null int64 7 P_Cook 780 non-null int64 8 P_DIY 780 non-null int64 9 P_Art 780 non-null int64 dtypes: int64(10) memory usage: 67.0 KB None
We observe a few things here:
integer
Let's use describe()
to see some statistical metrics:
# summary statistics (count, mean, std, min, quartiles, max) for each training column
x_train.describe()
Gender | Amount_purchased | Frequency | Last_purchase | First_purchase | P_Child | P_Youth | P_Cook | P_DIY | P_Art | |
---|---|---|---|---|---|---|---|---|---|---|
count | 3120.000000 | 3120.000000 | 3120.000000 | 3120.000000 | 3120.000000 | 3120.000000 | 3120.000000 | 3120.000000 | 3120.000000 | 3120.000000 |
mean | 0.670192 | 196.717949 | 12.815385 | 3.106410 | 22.627564 | 0.725000 | 0.347115 | 0.771795 | 0.394231 | 0.372436 |
std | 0.470219 | 94.960142 | 8.052841 | 2.942795 | 15.906823 | 1.018906 | 0.644874 | 1.032576 | 0.692815 | 0.675507 |
min | 0.000000 | 15.000000 | 2.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 122.000000 | 6.000000 | 1.000000 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 199.000000 | 12.000000 | 2.000000 | 18.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 1.000000 | 268.000000 | 16.000000 | 4.000000 | 30.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
max | 1.000000 | 474.000000 | 36.000000 | 12.000000 | 96.000000 | 8.000000 | 5.000000 | 6.000000 | 4.000000 | 5.000000 |
There doesn't appear to be any awkwardness in the data, meaning there are no negative minimums for Amount_purchased
. If I were modeling in R I might convert Gender to a factor, but I will leave the variable as is in Python since most libraries/algorithms can handle binary variables directly without having to convert.
# One histogram per predictor on a 2x5 grid, filled row by row and titled with the
# column name.
hist_columns = ['Gender', 'Amount_purchased', 'Frequency', 'Last_purchase',
                'First_purchase', 'P_Child', 'P_Youth', 'P_Cook', 'P_DIY', 'P_Art']
fig, axs = plt.subplots(2, 5, figsize=(20, 10))
fig.suptitle('Observing Normality in Data')
for ax, column in zip(axs.flat, hist_columns):
    ax.hist(x_train[column])
    ax.set_title(column)
Text(0.5, 1.0, 'P_Art')
Disregarding Gender
, all variables appear slightly right-skewed. We could do a log transformation and see if that recenters (normalizes) the data. The log transformation does not work for the count variables, whose ranges start at 0 (e.g., 0 to 6), because the logarithm function is undefined for values less than or equal to zero. We have many values at zero for multiple variables (e.g., P_Cook
). We will not log transform these variables, as removing this data would be removing the majority of the data.
# log transform data
# Only the four columns plotted below are transformed. The book-count columns and
# Gender contain zeros, so np.log over the whole frame yields -inf and a
# "divide by zero encountered in log" warning for columns that are never plotted.
log_columns = ['Amount_purchased', 'Frequency', 'Last_purchase', 'First_purchase']
log_xtrain = np.log(x_train[log_columns])
fig, axs = plt.subplots(1, 4, figsize=(20, 10))
fig.suptitle('Observing Normality in Data')
for ax, column in zip(axs, log_columns):
    ax.hist(log_xtrain[column])
    ax.set_title(column)
divide by zero encountered in log
Text(0.5, 1.0, 'First_purchase')
No Transformation Will Be Applied:
- Amount_purchased becomes less normal after the transformation, so we will not transform this variable.
- Last_purchase is dominated by the high volume of users sitting around 0; the transformation did not help, so we will not transform this variable.

Transformation Applied:
- Frequency and First_purchase did benefit from the transformation and became more normally distributed.

Next I will look for any relationships among the independent variables and between them and the dependent variable. We will observe a pairplot but want to include the response, so we will need to read in the data again.
# Re-read the raw workbooks so the response (Choice) is part of the plotted frame.
train0 = pd.read_excel('BBBC-Train.xlsx')
test0 = pd.read_excel('BBBC-Test.xlsx')
data0 = pd.concat([test0, train0])
# 'Observation' is deliberately kept here (the drop stays disabled):
#data0.drop('Observation', axis=1, inplace=True)
# sns.pairplot builds its own figure, so a preceding plt.figure(...) call only leaves
# an empty "<Figure ... with 0 Axes>" behind and its figsize never reaches the grid.
sns.pairplot(data0)
plt.show()
<Figure size 100x500 with 0 Axes>
The clearest relationship is between First_purchase
and Amount_purchased
. The more months since first purchase, the more the customer is likely to spend. Let's plot a heatmap of the same relationships.
# Annotated correlation heatmap over every column of data0.
# The colour scale is pinned to [0, 1], so negative correlations all take the
# lowest colour; the annotations still show their values.
correlations = data0.corr()
plt.figure(figsize=(12, 7))
sns.heatmap(correlations, annot=True, vmin=0, vmax=1)
plt.show()
Multicollinearity is when two or more independent variables in a regression model are highly correlated with each other. This can be a problem because it can make it difficult to identify the unique effects of each variable on the dependent variable. Additionally, because the independent variables are correlated, the coefficients of the independent variables in the regression equation can change a lot when a new data point is added or when a data point is removed, making the model unstable and unreliable. In this study, any correlation over 75% is considered highly correlated.
Things to consider from heatmap:
- Choice: there is a small relationship with P_Art, stating whether the customer purchased art books in the past. This makes sense as the marketing material is for an art book.
- First_purchase and Last_purchase are correlated; also remember that First_purchase was more normally distributed.
- Last_purchase and P_Child, as well as Last_purchase and P_Cook, are correlated. This model may benefit from removing the Last_purchase variable since it is so correlated with multiple variables.
- Observation: this appears to be an id column that denotes individual customers. How does that indicate purchase?

Let's rethink what the variables represent to identify applications for feature engineering:
Name | Type | Description |
---|---|---|
Choice | int | If the customer purchased The Art History of Florence. 1 = purchase and 0 = non-purchase |
Gender | int | 0 = Female and 1 = Male |
Amount_purchased | int | Total money spent on BBBC books |
Frequency | int | Total number of purchases in the chosen period |
Last_purchase | int | Months since last purchase |
First_purchase | int | Months since first purchase |
P_Child | int | Number of children’s books purchased |
P_Youth | int | Number of youth’s books purchased |
P_Cook | int | Number of cookbooks purchased |
P_DIY | int | Number of do-it-yourself books purchased |
P_Art | int | Number of art books purchased |
Frequency
variable more and define what it means to be "within the chosen period". Is that within the sampling period, within a given month?Frequency
that would be more telling.

NOTE: I originally modeled without the Observation variable and obtained much lower metrics, like the logistic regression shown below.
TODO: Explain metrics: Precision is low for the range of Recall due to the data imbalance, and the PR curve is outputting an L shape. The ROC curve is not robust to data imbalance and swings wildly, so it is not completely reliable.
After tracking and comparing runs in MlFlow we see there are differences with sampling. It is much better at placing true positives but also has many more false positives
Logit Model Before Sampling
Logit Model After Sampling
As shown in the confusion matrices the model has gotten better at predicting our minority class after resampling
# make model
model = LogisticRegression(random_state=7, max_iter=1000)
m1 = myModel('Logistic Regression', model)
# make data: all predictors, no resampling. The third argument is only read for
# "slim", so pass None instead of `_`, which exists only in IPython.
Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", None)
print(ytrain.hist())
# fit model and score it
logit_model, logit_scores = m1.fit_model(model, ['f1', 'accuracy'], tuning=False)
# make predictions
logit_preds = m1.make_preds(logit_model)
# make roc plots
logit_df = m1.make_roc(logit_preds)
# make classification report
logit_rpt = m1.make_report(logit_preds, logit_df)
# make confusion matrix
logit_cm = m1.make_cm(logit_preds, logit_model, logit_rpt)
Axes(0.125,0.11;0.775x0.77) Average Test Accuracy Score: 0.8833333333333332
# make model
model = LogisticRegression(random_state=7, max_iter=1000)
m1 = myModel('Logistic Regression', model)
# make data: all predictors, training split resampled ("after"). The third argument
# is only read for "slim", so pass None instead of `_`, which exists only in IPython.
Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "after", None)
print(ytrain.hist())
# fit model and score it
logit_model, logit_scores = m1.fit_model(model, ['f1', 'accuracy'], tuning=False)
# make predictions
logit_preds = m1.make_preds(logit_model)
# make roc plots
logit_df = m1.make_roc(logit_preds)
# make classification report
logit_rpt = m1.make_report(logit_preds, logit_df)
# make confusion matrix
logit_cm = m1.make_cm(logit_preds, logit_model, logit_rpt)
Axes(0.125,0.11;0.775x0.77) Average Test Accuracy Score: 0.8833333333333332
All of my modeling was tracked using MlFlow, an open-source tool to easily track model runs, parameters, and metrics as you iterate through the modeling process. The most optimal models were selected based on a combination of:
# Track a logistic-regression run (full feature set, no resampling) in MLflow.
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='F_b_logit'):
    model = LogisticRegression(random_state=7, max_iter=1000)
    m1 = myModel('Logistic Regression', model)
    # make data (third argument is only read for "slim"; `_` exists only in IPython)
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", None)
    print("Data Loaded")
    print("Building Model...")
    # fit model and score it
    logit_model, logit_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False)
    # input/output schema, stored with the logged model below
    signature = infer_signature(Xtest, logit_model.predict(Xtest))
    # Build the evaluation dataset from a COPY of the test set: assigning the label
    # column to Xtest itself would leave the target inside the feature frame.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest
    # "before" means make_data applied no resampling, so both ratios are 0
    undersamp = 0
    oversamp = 0
    # log metrics (mean over the cross-validation folds)
    mlflow.log_metric('accuracy', mean(logit_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(logit_scores['test_f1']))
    mlflow.log_metric('precision', mean(logit_scores['test_precision']))
    mlflow.log_metric('recall', mean(logit_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    # log model
    mlflow.sklearn.log_model(logit_model, "logit model", signature=signature)
    model_uri = mlflow.get_artifact_uri("logit model")
    print("Model Loaded")
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.
Data Loaded Average Test Accuracy Score: 0.8833333333333332
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:11:02 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:11:02 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:11:02 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:11:02 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used. /Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning: Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 16:11:02 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
# no significant improvement with log transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='b_rf_final'):
    model = RandomForestClassifier(random_state=7)
    m1 = myModel('Tuned Random Forest', model)
    # Author's rationale: random forest is an ideal algorithm to deal with the extreme
    # imbalance owing to two main reasons. Firstly, the ability to incorporate class
    # weights into the random forest classifier makes it cost-sensitive; hence it
    # penalizes misclassifying the minority class.
    # NOTE(review): no class_weight is passed to the classifier above - confirm
    # whether class weighting was meant to be enabled.
    # make data (third argument is only read for "slim"; `_` exists only in IPython)
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", None)
    print("Data Loaded")
    print("Building Model...")
    # tune over the grid and score the result
    params = {'n_estimators': [25, 50, 100, 200, 500],
              'max_depth': [2, 5, 8, 50]}
    rf_model, rf_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=True, params=params)
    # input/output schema, stored with the logged model below
    signature = infer_signature(Xtest, rf_model.predict(Xtest))
    # Build the evaluation dataset from a COPY of the test set. Adding "label" to
    # Xtest itself would hand the target column to the SHAP explainer further down.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest
    undersamp = 0
    oversamp = 0
    rf_params = rf_model.get_params()
    # log parameters
    mlflow.log_param('n_estimators', rf_params['n_estimators'])
    mlflow.log_param('max_depth', rf_params['max_depth'])
    # log metrics (mean over the cross-validation folds)
    mlflow.log_metric('accuracy', mean(rf_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(rf_scores['test_f1']))
    mlflow.log_metric('precision', mean(rf_scores['test_precision']))
    mlflow.log_metric('recall', mean(rf_scores['test_recall']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)
    # log model
    mlflow.sklearn.log_model(rf_model, "log rf model", signature=signature)
    model_uri = mlflow.get_artifact_uri("log rf model")
    print("Model Loaded")
    # SHAP values over the test features only (no label column)
    explainer = shap.Explainer(rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        # explainability logging is disabled: error called when creating beeswarm plot
        evaluator_config={"log_model_explainability": False},
    )
    # save results in json
    result.save("")
    print("Metrics Saved")
# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.
Data Loaded Building Model... Average Test Accuracy Score: 0.8705128205128204
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
Model Loaded
98%|===================| 1534/1560 [00:21<00:00] 2023/06/13 16:15:27 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:15:27 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:15:27 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Metrics Saved
<Figure size 1050x700 with 0 Axes>
# no improvement with log transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='b_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                              eta=.2,
                              max_depth=3,
                              gamma=1,
                              subsample=1)
    m1 = myModel('Tuned XGB Model', model)
    # make data (third argument is only read for "slim"; `_` exists only in IPython)
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", None)
    print("Data Loaded")
    print("Building Model...")
    # NOTE(review): this grid is passed along but tuning=False, so fit_model never
    # searches it; the classifier is fit with the constructor values above.
    params = {'eta': [0.01, 0.2],
              'max_depth': [3],
              'gamma': [0, .1],
              'subsample': [0.5, 1]}
    tuned_xgb_model, tuned_xgb_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False, params=params)
    # input/output schema, stored with the logged model below
    signature = infer_signature(Xtest, tuned_xgb_model.predict(Xtest))
    # Build the evaluation dataset from a COPY of the test set: assigning the label
    # column to Xtest itself would leave the target inside the feature frame.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    xgb_params = tuned_xgb_model.get_params()
    # log parameters
    mlflow.log_param('eta', xgb_params['eta'])
    mlflow.log_param('max_depth', xgb_params['max_depth'])
    mlflow.log_param('gamma', xgb_params['gamma'])
    mlflow.log_param('subsample', xgb_params['subsample'])
    # log metrics (mean over the cross-validation folds)
    mlflow.log_metric('accuracy', mean(tuned_xgb_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(tuned_xgb_scores['test_f1']))
    mlflow.log_metric('precision', mean(tuned_xgb_scores['test_precision']))
    mlflow.log_metric('recall', mean(tuned_xgb_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
    # log model
    mlflow.sklearn.log_model(tuned_xgb_model, "xgb model", signature=signature)
    model_uri = mlflow.get_artifact_uri("xgb model")
    print("Model Loaded")
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    # save results in json
    result.save("")
    print("Metrics Saved")
# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.
Data Loaded Building Model... Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:16:02 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:16:02 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:16:02 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:16:02 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used. /Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning: Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 16:16:03 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Metrics Saved
# no significant difference after log transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='b_catboost_final'):
    # grid searched by fit_model (tuning=True below)
    params = {'iterations': [2, 4, 8, 20, 50],
              'depth': [6, 7, 8, 9]}
    model = CatBoostClassifier(bagging_temperature=0,
                               learning_rate=1,
                               loss_function='Logloss',
                               verbose=False,
                               random_state=7)
    m1 = myModel('CatBoost Model', model)
    # make data (third argument is only read for "slim"; `_` exists only in IPython)
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", None)
    print("Data Loaded")
    print("Building Model...")
    catboost_model, catboost_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=True, params=params)
    catboost_params = catboost_model.get_params()
    # input/output schema, stored with the logged model below
    signature = infer_signature(Xtest, catboost_model.predict(Xtest))
    # Build the evaluation dataset from a COPY of the test set: assigning the label
    # column to Xtest itself would leave the target inside the feature frame.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest
    undersample = 0
    oversample = 0
    # log parameters
    mlflow.log_param('iterations', catboost_params['iterations'])
    mlflow.log_param('learning_rate', catboost_params['learning_rate'])
    mlflow.log_param('depth', catboost_params['depth'])
    mlflow.log_param('bagging_temperature', catboost_params['bagging_temperature'])
    mlflow.log_param('loss_function', catboost_params['loss_function'])
    # log metrics (mean over the cross-validation folds)
    mlflow.log_metric('accuracy', mean(catboost_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(catboost_scores['test_f1']))
    mlflow.log_metric('precision', mean(catboost_scores['test_precision']))
    mlflow.log_metric('recall', mean(catboost_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)
    # log model
    mlflow.sklearn.log_model(catboost_model, "catboost model", signature=signature)
    model_uri = mlflow.get_artifact_uri("catboost model")
    print("Model Loaded")
    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
    # save results in json
    result.save("")
    print("Metrics Saved")
# The `with` block ends the run on exit, so no explicit mlflow.end_run() is needed.
Data Loaded Building Model... Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:17:13 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:17:13 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:17:13 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0. 2023/06/13 16:17:13 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 16:17:14 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used. /Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning: Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 16:17:14 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Metrics Saved
# significant improvement after log transformation and resampling
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# Logistic regression on the "full"/"after" dataset with Frequency and
# First_purchase log-transformed; logs mean cross-validation scores and the
# fitted model to MLflow, then evaluates on the hold-out split.
with mlflow.start_run(run_name='a_log_logit_final'):
    model = LogisticRegression(random_state=7, max_iter=1000)
    m1 = myModel('Log Logistic Regression', model)

    # make data
    # NOTE(review): `_` only exists in an interactive session (IPython's
    # "last output"); presumably make_data ignores its third argument for
    # "full" -- confirm and pass an explicit value instead.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "after", _)
    print("Data Loaded")
    print("Building Model...")

    # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model (no hyper-parameter tuning for this run)
    log_logit_model, log_logit_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False)

    # Model signature, inferred from the transformed test features.
    signature = infer_signature(Xtest, log_logit_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    # Sampling ratios recorded with the run.
    undersamp = .2
    oversamp = .3

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(log_logit_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(log_logit_model, "log_logit model after002003",
                             signature=signature)
    model_uri = mlflow.get_artifact_uri("log_logit model after002003")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Building Model... Average Test Accuracy Score: 0.8923076923076924
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:21:46 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:21:46 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:21:46 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:21:47 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used. /Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning: Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 16:21:47 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Metrics Saved
# no significant improvement with log transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# Random forest on the "full" dataset with Frequency and First_purchase
# log-transformed: tune with the grid below, log parameters, mean
# cross-validation scores and the fitted model to MLflow, compute SHAP values
# on the test features, then evaluate on the hold-out split.
with mlflow.start_run(run_name='b_log_rf_final'):
    model = RandomForestClassifier(random_state=7)
    m1 = myModel('Log Random Forest', model)

    # make data
    # Random forest is an ideal algorithm to deal with the extreme imbalance
    # owing to two main reasons. Firstly, the ability to incorporate class
    # weights into the random forest classifier makes it cost-sensitive;
    # hence it penalizes misclassifying the minority class.
    # NOTE(review): no class_weight is actually set on the classifier above.
    # NOTE(review): `_` only exists in an interactive session (IPython's
    # "last output"); presumably make_data ignores its third argument for
    # "full" -- confirm and pass an explicit value instead.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")

    # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model: hyper-parameter grid handed to fit_model (tuning=True)
    params = {'n_estimators': [25, 50, 100, 200, 500],
              'max_depth': [2, 5, 8, 50]}
    log_rf_model, log_rf_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=True, params=params)

    # Model signature, inferred from the transformed test features.
    signature = infer_signature(Xtest, log_rf_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest, which is handed to the
    # SHAP explainer further down.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersamp = 0
    oversamp = 0
    log_rf_params = log_rf_model.get_params()

    # log parameters
    for param_name in ('n_estimators', 'max_depth'):
        mlflow.log_param(param_name, log_rf_params[param_name])

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(log_rf_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(log_rf_model, "log rf model", signature=signature)
    model_uri = mlflow.get_artifact_uri("log rf model")
    print("Model Loaded")

    # SHAP values computed on the test features only (no target column).
    explainer = shap.Explainer(log_rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values

    # evaluate model; MLflow's own explainability logging is switched off
    # because creating the beeswarm plot raised an error.
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config={"log_model_explainability": False},
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Average Test Accuracy Score: 0.8743589743589745
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
Model Loaded
2023/06/13 14:37:09 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 14:37:09 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 14:37:09 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
<Figure size 1050x700 with 0 Axes>
# no improvement with log transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# XGBoost on the "full" dataset with Frequency and First_purchase
# log-transformed; logs parameters, mean cross-validation scores and the fitted
# model to MLflow, then evaluates on the hold-out split.
with mlflow.start_run(run_name='b_log_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                              eta=.2,
                              max_depth=3,
                              gamma=1,
                              subsample=1)
    m1 = myModel('Log XGB Model', model)

    # make data
    # NOTE(review): `_` only exists in an interactive session (IPython's
    # "last output"); presumably make_data ignores its third argument for
    # "full" -- confirm and pass an explicit value instead.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")

    # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model
    # NOTE(review): the grid is passed with tuning=False, so it is presumably
    # unused by fit_model -- confirm whether tuning was meant to be enabled.
    params = {'eta': [0.2],
              'max_depth': [3],
              'gamma': [0, .1],
              'subsample': [1]}
    log_tuned_xgb_model, log_tuned_xgb_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False, params=params)

    # Model signature, inferred from the transformed test features.
    signature = infer_signature(Xtest, log_tuned_xgb_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0
    log_xgb_params = log_tuned_xgb_model.get_params()

    # log parameters
    for param_name in ('eta', 'max_depth', 'gamma', 'subsample'):
        mlflow.log_param(param_name, log_xgb_params[param_name])

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(log_tuned_xgb_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(log_tuned_xgb_model, "xgb model", signature=signature)
    model_uri = mlflow.get_artifact_uri("xgb model")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Building Model... Average Test Accuracy Score: 0.8743589743589745
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:27:44 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:27:44 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:27:44 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:27:44 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used. /Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning: Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 16:27:45 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Metrics Saved
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# CatBoost on the "full" dataset with Frequency and First_purchase
# log-transformed: tune with the grid below, log parameters, mean
# cross-validation scores and the fitted model to MLflow, then evaluate on the
# hold-out split.
with mlflow.start_run(run_name='b_log_catboost_final'):
    # Hyper-parameter grid handed to fit_model (tuning=True).
    params = {'iterations': [2, 4],
              'depth': [6, 7, 8, 9],
              'bagging_temperature': [0, 10, 50]}
    model = CatBoostClassifier(learning_rate=1,
                               loss_function='Logloss',
                               verbose=False,
                               random_state=7)
    m1 = myModel('Log CatBoost Model', model)

    # make data
    # NOTE(review): `_` only exists in an interactive session (IPython's
    # "last output"); presumably make_data ignores its third argument for
    # "full" -- confirm and pass an explicit value instead.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
    print("Data Loaded")
    print("Building Model...")

    # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    log_catboost_model, log_catboost_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=True, params=params)
    log_catboost_params = log_catboost_model.get_params()

    # Model signature, inferred from the transformed test features.
    signature = infer_signature(Xtest, log_catboost_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0

    # log parameters
    for param_name in ('iterations', 'learning_rate', 'depth',
                       'bagging_temperature', 'loss_function'):
        mlflow.log_param(param_name, log_catboost_params[param_name])

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(log_catboost_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(log_catboost_model, "log_catboost model",
                             signature=signature)
    model_uri = mlflow.get_artifact_uri("log_catboost model")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded
Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Average Test Accuracy Score: 0.8666666666666666
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 14:37:19 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 14:37:19 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 14:37:19 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0. 2023/06/13 14:37:19 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 14:37:20 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used. Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 14:37:20 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
# Bar chart of the forest's feature importances; the error bars show the
# standard deviation of each feature's importance across the individual trees.
importances = rf_model.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in rf_model.estimators_]
std = np.std(per_tree_importances, axis=0)
feature_names = list(rf_model.feature_names_in_)
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
# significant improvement after slim transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# Logistic regression on the "slim"/"after" dataset (four selected features)
# with Frequency and First_purchase log-transformed; logs mean cross-validation
# scores and the fitted model to MLflow, then evaluates on the hold-out split.
with mlflow.start_run(run_name='a_slim_log_logit_final'):
    model = LogisticRegression(random_state=7, max_iter=1000)
    m1 = myModel('Slim Log Logistic Regression', model)

    # make data
    Xtrain, Xtest, ytrain, ytest = m1.make_data(
        "slim", "after", ['P_Art', 'Amount_purchased', 'Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")

    # log transform data
    Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])

    # make model (no hyper-parameter tuning for this run)
    slim_log_logit_model, slim_log_logit_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False)

    # Model signature, inferred from the transformed test features.
    signature = infer_signature(Xtest, slim_log_logit_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    # Sampling ratios recorded with the run.
    undersamp = .2
    oversamp = .3

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(slim_log_logit_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(slim_log_logit_model, "slim_log_logit model after002003",
                             signature=signature)
    model_uri = mlflow.get_artifact_uri("slim_log_logit model after002003")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Building Model... Average Test Accuracy Score: 0.8846153846153847
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:25:54 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:25:54 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:25:54 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:25:54 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Linear is used. /Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/shap.py:459: UserWarning: Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 16:25:54 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'LogisticRegression' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:340: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored /Users/coolkid/anaconda3/lib/python3.10/site-packages/shap/plots/_beeswarm.py:664: UserWarning: No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
Metrics Saved
# no significant improvement with slim transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# Random forest on the "slim" dataset (four selected features): tune with the
# grid below, log parameters, mean cross-validation scores and the fitted model
# to MLflow, compute SHAP values on the test features, then evaluate on the
# hold-out split.
with mlflow.start_run(run_name='b_slim_rf_model_final'):
    model = RandomForestClassifier(random_state=7)
    m1 = myModel('Slim Random Forest', model)

    # make data
    # Random forest is an ideal algorithm to deal with the extreme imbalance
    # owing to two main reasons. Firstly, the ability to incorporate class
    # weights into the random forest classifier makes it cost-sensitive;
    # hence it penalizes misclassifying the minority class.
    # NOTE(review): no class_weight is actually set on the classifier above.
    Xtrain, Xtest, ytrain, ytest = m1.make_data(
        "slim", "before", ['P_Art', 'Amount_purchased', 'Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")

    # make model: hyper-parameter grid handed to fit_model (tuning=True)
    params = {'n_estimators': [25, 50, 100, 200, 500],
              'max_depth': [2, 5, 8, 50]}
    slim_rf_model, slim_rf_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=True, params=params)

    # Model signature, inferred from the untouched test features.
    signature = infer_signature(Xtest, slim_rf_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest, which is handed to the
    # SHAP explainer further down.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersamp = 0
    oversamp = 0
    slim_rf_params = slim_rf_model.get_params()

    # log parameters
    for param_name in ('n_estimators', 'max_depth'):
        mlflow.log_param(param_name, slim_rf_params[param_name])

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(slim_rf_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversamp)
    mlflow.log_metric('undersample', undersamp)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(slim_rf_model, "slim_log rf model", signature=signature)
    model_uri = mlflow.get_artifact_uri("slim_log rf model")
    print("Model Loaded")

    # SHAP values computed on the test features only (no target column).
    explainer = shap.Explainer(slim_rf_model, Xtest)
    shap_values = explainer(Xtest)
    shap_exp = shap_values

    # evaluate model; MLflow's own explainability logging is switched off
    # because creating the beeswarm plot raised an error.
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators="default",
        evaluator_config={"log_model_explainability": False},
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Average Test Accuracy Score: 0.8705128205128204
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
Model Loaded
99%|===================| 1538/1560 [00:18<00:00] 2023/06/13 14:38:56 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 14:38:56 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 14:38:56 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
<Figure size 1050x700 with 0 Axes>
# worse with slim transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# XGBoost on the "slim" dataset (four selected features); logs parameters, mean
# cross-validation scores and the fitted model to MLflow, then evaluates on the
# hold-out split.
with mlflow.start_run(run_name='b_slim_xgb_final'):
    model = xgb.XGBClassifier(random_state=7,
                              eta=.2,
                              max_depth=3,
                              gamma=1,
                              subsample=1)
    m1 = myModel('Slim XGB Model', model)

    # make data
    Xtrain, Xtest, ytrain, ytest = m1.make_data(
        "slim", "before", ['P_Art', 'Amount_purchased', 'Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")

    # make model
    # NOTE(review): the grid is passed with tuning=False, so it is presumably
    # unused by fit_model -- confirm whether tuning was meant to be enabled.
    params = {'eta': [0.01, 0.2],
              'max_depth': [3],
              'gamma': [0, .1],
              'subsample': [0.5, 1]}
    slim_tuned_xgb_model, slim_tuned_xgb_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False, params=params)

    # Model signature, inferred from the untouched test features.
    signature = infer_signature(Xtest, slim_tuned_xgb_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0
    slim_xgb_params = slim_tuned_xgb_model.get_params()

    # log parameters
    for param_name in ('eta', 'max_depth', 'gamma', 'subsample'):
        mlflow.log_param(param_name, slim_xgb_params[param_name])

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(slim_tuned_xgb_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(slim_tuned_xgb_model, "slim_xgb model", signature=signature)
    model_uri = mlflow.get_artifact_uri("slim_xgb model")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Average Test Accuracy Score: 0.8807692307692306
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 14:38:59 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 14:38:59 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 14:38:59 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 14:39:00 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used. Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 14:39:00 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
# worse with slim transformation
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")

# CatBoost on the "slim" dataset (four selected features): tune with the grid
# below, log parameters, mean cross-validation scores and the fitted model to
# MLflow, then evaluate on the hold-out split.
with mlflow.start_run(run_name='b_slim_catboost_final'):
    # Hyper-parameter grid handed to fit_model (tuning=True).
    params = {'iterations': [2, 4, 8, 20, 50],
              'depth': [6, 7, 8, 9]}
    model = CatBoostClassifier(bagging_temperature=0,
                               learning_rate=1,
                               loss_function='Logloss',
                               verbose=False,
                               random_state=7)
    m1 = myModel('Slim CatBoost Model', model)

    # make data
    Xtrain, Xtest, ytrain, ytest = m1.make_data(
        "slim", "before", ['P_Art', 'Amount_purchased', 'Frequency', 'First_purchase'])
    print("Data Loaded")
    print("Building Model...")

    slim_catboost_model, slim_catboost_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=True, params=params)
    slim_catboost_params = slim_catboost_model.get_params()

    # Model signature, inferred from the untouched test features.
    signature = infer_signature(Xtest, slim_catboost_model.predict(Xtest))

    # Build the evaluation dataset from the test set. Copy first so that adding
    # the target column does not also add it to Xtest.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0

    # log parameters
    for param_name in ('iterations', 'learning_rate', 'depth',
                       'bagging_temperature', 'loss_function'):
        mlflow.log_param(param_name, slim_catboost_params[param_name])

    # log metrics (mean over the cross-validation folds)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall'):
        mlflow.log_metric(metric_name, mean(slim_catboost_scores[f'test_{metric_name}']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model, together with the signature inferred above
    mlflow.sklearn.log_model(slim_catboost_model, "slim_catboost model",
                             signature=signature)
    model_uri = mlflow.get_artifact_uri("slim_catboost model")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Average Test Accuracy Score: 0.873076923076923
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 14:39:11 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 14:39:11 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 14:39:11 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0. 2023/06/13 14:39:11 WARNING mlflow.models.evaluation.default_evaluator: Computing sklearn model score failed: TypeError("CatBoostClassifier.score() got an unexpected keyword argument 'sample_weight'"). Set logging level to DEBUG to see the full traceback.
Model Loaded
2023/06/13 14:39:12 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used. Unable to serialize underlying model using MLflow, will use SHAP serialization 2023/06/13 14:39:12 WARNING mlflow.models.evaluation.default_evaluator: Logging explainer failed. Reason: AttributeError("'TreeEnsemble' object has no attribute 'save'"). Set logging level to DEBUG to see the full traceback. No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
# Trains, logs and evaluates the hard-voting ensemble logged as MLflow run 'ensv1_final'.
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='ensv1_final'):
    # Members of this ensemble are models built in earlier cells. An earlier
    # three-member variant (logit, rf_tuned, xgb) did not include CatBoost.
    estimators = [('logit', logit_model), ('rf_tuned', rf_model), ('xgb', tuned_xgb_model),
                  ('tuned_sample_catboost', catboost_model)]
    model = VotingClassifier(estimators, voting='hard')
    m1 = myModel('EnsV1 Model', model)

    # make data
    # NOTE(review): `_` is IPython's "last output" variable; the third argument is
    # presumably ignored when the "full" dataset is requested - confirm in make_data.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
    print("Data Loaded")

    print("Building Model...")
    ensemble_v1_model, ensemble_v1_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False)

    # Infer the input/output schema from the untouched test features so it can
    # be stored alongside the logged model.
    signature = infer_signature(Xtest, ensemble_v1_model.predict(Xtest))

    # Build the evaluation dataset from a COPY of the test set: assigning the
    # label column to an alias of Xtest would silently add "label" to Xtest too.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0

    # names of the ensemble members, logged as run parameters
    models = {'modl1': 'logit',
              'modl2': 'rf_tuned',
              'modl3': 'xgb',
              'modl4': 'tuned_sample_catboost'}

    # log params
    mlflow.log_params(models)

    # log metrics (mean of the per-fold scores returned by fit_model)
    mlflow.log_metric('accuracy', mean(ensemble_v1_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v1_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v1_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v1_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model together with its inferred signature
    mlflow.sklearn.log_model(ensemble_v1_model, "ens1 model", signature=signature)
    model_uri = mlflow.get_artifact_uri("ens1 model")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Building Model...
/Users/coolkid/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
Average Test Accuracy Score: 0.8756410256410255
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:30:05 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:30:05 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:30:05 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:30:05 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('logit',\n LogisticRegression(max_iter=1000,\n random_state=7)),\n ('rf_tuned',\n RandomForestClassifier(max_depth=8,\n n_estimators=200,\n random_state=7)),\n ('xgb',\n XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n early_stopping_rounds=None,\n enable_categorical=Fal...\n interaction_constraints=None,\n learning_rate=None, max_bin=None,\n max_cat_threshold=None,\n max_cat_to_onehot=None,\n max_delta_step=None, max_depth=3,\n max_leaves=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=100, n_jobs=None,\n num_parallel_tree=None,\n predictor=None, ...)),\n ('tuned_sample_catboost',\n <catboost.core.CatBoostClassifier object at 0x17a8ae560>)])"). Set logging level to DEBUG to see the full traceback.
Metrics Saved
<Figure size 1050x700 with 0 Axes>
# Trains, logs and evaluates the hard-voting ensemble logged as MLflow run 'ensv2_final'.
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='ensv2_final'):
    # Five-member ensemble; the members are models built in earlier cells.
    estimators = [('log logit', log_logit_model), ('logit', logit_model), ('rf_tuned', rf_model),
                  ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model)]
    model = VotingClassifier(estimators, voting='hard')
    m1 = myModel('EnsV2 Model', model)

    # make data
    # NOTE(review): `_` is IPython's "last output" variable; the third argument is
    # presumably ignored when the "full" dataset is requested - confirm in make_data.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
    print("Data Loaded")

    print("Building Model...")
    ensemble_v2_model, ensemble_v2_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False)

    # Infer the input/output schema from the untouched test features so it can
    # be stored alongside the logged model.
    signature = infer_signature(Xtest, ensemble_v2_model.predict(Xtest))

    # Build the evaluation dataset from a COPY of the test set: assigning the
    # label column to an alias of Xtest would silently add "label" to Xtest too.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0

    # names of the ensemble members, logged as run parameters
    models = {'modl1': 'logit',
              'modl2': 'rf_tuned',
              'modl3': 'xgb',
              'modl4': 'tuned_sample_catboost',
              'modl5': 'log_logit'}

    # log params
    mlflow.log_params(models)

    # log metrics (mean of the per-fold scores returned by fit_model)
    mlflow.log_metric('accuracy', mean(ensemble_v2_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v2_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v2_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v2_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model together with its inferred signature
    mlflow.sklearn.log_model(ensemble_v2_model, "ens2 model", signature=signature)
    model_uri = mlflow.get_artifact_uri("ens2 model")
    print("Model Loaded")

    # evaluate model (the result is kept in `result` but not saved to disk for this run)
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Average Test Accuracy Score: 0.8756410256410255
Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 14:39:23 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 14:39:23 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 14:39:23 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 14:39:23 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n LogisticRegression(max_iter=1000,\n random_state=7)),\n ('logit',\n LogisticRegression(max_iter=1000,\n random_state=7)),\n ('rf_tuned',\n RandomForestClassifier(max_depth=8,\n n_estimators=200,\n random_state=7)),\n ('xgb',\n XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample...\n interaction_constraints=None,\n learning_rate=None, max_bin=None,\n max_cat_threshold=None,\n max_cat_to_onehot=None,\n max_delta_step=None, max_depth=3,\n max_leaves=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=100, n_jobs=None,\n num_parallel_tree=None,\n predictor=None, ...)),\n ('tuned_sample_catboost',\n <catboost.core.CatBoostClassifier object at 0x285186b60>)])"). Set logging level to DEBUG to see the full traceback.
<Figure size 1050x700 with 0 Axes>
# Trains, logs and evaluates the hard-voting ensemble logged as MLflow run 'ensv3_final'.
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='ensv3_final'):
    # Six-member ensemble; the members are models built in earlier cells.
    estimators = [('log logit', log_logit_model), ('logit', logit_model), ('rf_tuned', rf_model),
                  ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model),
                  ('slim log logit', slim_log_logit_model)]
    model = VotingClassifier(estimators, voting='hard')
    m1 = myModel('EnsV3 Model', model)

    # make data
    # NOTE(review): `_` is IPython's "last output" variable; the third argument is
    # presumably ignored when the "full" dataset is requested - confirm in make_data.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
    print("Data Loaded")

    print("Building Model...")
    ensemble_v3_model, ensemble_v3_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False)

    # Infer the input/output schema from the untouched test features so it can
    # be stored alongside the logged model.
    signature = infer_signature(Xtest, ensemble_v3_model.predict(Xtest))

    # Build the evaluation dataset from a COPY of the test set: assigning the
    # label column to an alias of Xtest would silently add "label" to Xtest too.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0

    # names of the ensemble members, logged as run parameters
    models = {'modl1': 'logit',
              'modl2': 'rf_tuned',
              'modl3': 'xgb',
              'modl4': 'tuned_sample_catboost',
              'modl5': 'log_logit',
              'modl6': 'slim log logit'}

    # log params
    mlflow.log_params(models)

    # log metrics (mean of the per-fold scores returned by fit_model)
    mlflow.log_metric('accuracy', mean(ensemble_v3_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v3_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v3_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v3_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model together with its inferred signature
    mlflow.sklearn.log_model(ensemble_v3_model, "ens3 model", signature=signature)
    model_uri = mlflow.get_artifact_uri("ens3 model")
    print("Model Loaded")

    # evaluate model (the result is kept in `result` but not saved to disk for this run)
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Average Test Accuracy Score: 0.8794871794871796
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 15:33:16 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 15:33:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 15:33:16 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 15:33:16 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n LogisticRegression(max_iter=1000,\n random_state=7)),\n ('logit',\n LogisticRegression(max_iter=1000,\n random_state=7)),\n ('rf_tuned',\n RandomForestClassifier(max_depth=8,\n n_estimators=200,\n random_state=7)),\n ('xgb',\n XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample...\n max_cat_to_onehot=None,\n max_delta_step=None, max_depth=3,\n max_leaves=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=100, n_jobs=None,\n num_parallel_tree=None,\n predictor=None, ...)),\n ('tuned_sample_catboost',\n <catboost.core.CatBoostClassifier object at 0x28698aa40>),\n ('slim log logit',\n LogisticRegression(max_iter=1000,\n random_state=7))])"). Set logging level to DEBUG to see the full traceback.
<Figure size 1050x700 with 0 Axes>
# Trains, logs and evaluates the hard-voting ensemble logged as MLflow run 'ensv4_final'.
mlflow.set_tracking_uri("")
mlflow.set_experiment(experiment_name="BBBC_modeling")
with mlflow.start_run(run_name='ensv4_final'):
    # Seven-member ensemble; the members are models built in earlier cells.
    estimators = [('log logit', log_logit_model), ('logit', logit_model), ('rf_tuned', rf_model),
                  ('xgb', tuned_xgb_model), ('tuned_sample_catboost', catboost_model),
                  ('slim log logit', slim_log_logit_model), ('log_xgb', log_tuned_xgb_model)]
    model = VotingClassifier(estimators, voting='hard')
    m1 = myModel('EnsV4 Model', model)

    # make data
    # NOTE(review): `_` is IPython's "last output" variable; the third argument is
    # presumably ignored when the "full" dataset is requested - confirm in make_data.
    Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
    print("Data Loaded")

    print("Building Model...")
    ensemble_v4_model, ensemble_v4_scores = m1.fit_model(
        model, ['f1', 'accuracy', 'precision', 'recall'], tuning=False)

    # Infer the input/output schema from the untouched test features so it can
    # be stored alongside the logged model.
    signature = infer_signature(Xtest, ensemble_v4_model.predict(Xtest))

    # Build the evaluation dataset from a COPY of the test set: assigning the
    # label column to an alias of Xtest would silently add "label" to Xtest too.
    eval_data = Xtest.copy()
    eval_data["label"] = ytest

    undersample = 0
    oversample = 0

    # names of the ensemble members, logged as run parameters
    models = {'modl1': 'logit',
              'modl2': 'rf_tuned',
              'modl3': 'xgb',
              'modl4': 'tuned_sample_catboost',
              'modl5': 'log_logit',
              'modl6': 'slim log logit',
              'modl7': 'log_xgb'}

    # log params
    mlflow.log_params(models)

    # log metrics (mean of the per-fold scores returned by fit_model)
    mlflow.log_metric('accuracy', mean(ensemble_v4_scores['test_accuracy']))
    mlflow.log_metric('f1', mean(ensemble_v4_scores['test_f1']))
    mlflow.log_metric('precision', mean(ensemble_v4_scores['test_precision']))
    mlflow.log_metric('recall', mean(ensemble_v4_scores['test_recall']))
    mlflow.log_metric('oversample', oversample)
    mlflow.log_metric('undersample', undersample)

    # log model together with its inferred signature
    mlflow.sklearn.log_model(ensemble_v4_model, "ens4 model", signature=signature)
    model_uri = mlflow.get_artifact_uri("ens4 model")
    print("Model Loaded")

    # evaluate model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="classifier",
        evaluators=["default"],
    )

    # save results in json
    result.save("")
    print("Metrics Saved")
# No explicit mlflow.end_run(): leaving the `with` block ends the run.
Data Loaded Building Model... Average Test Accuracy Score: 0.8756410256410255
/Users/coolkid/anaconda3/lib/python3.10/site-packages/mlflow/models/signature.py:137: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details. 2023/06/13 16:32:35 WARNING mlflow.pyfunc: Detected one or more mismatches between the model's dependencies and the current Python environment: - mlflow (current: 2.3.2, required: mlflow==2.3) To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file. 2023/06/13 16:32:35 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator. 2023/06/13 16:32:35 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Model Loaded
2023/06/13 16:32:36 WARNING mlflow.models.evaluation.default_evaluator: Shap evaluation failed. Reason: TypeError("The passed model is not callable and cannot be analyzed directly with the given masker! Model: VotingClassifier(estimators=[('log logit',\n LogisticRegression(max_iter=1000,\n random_state=7)),\n ('logit',\n LogisticRegression(max_iter=1000,\n random_state=7)),\n ('rf_tuned',\n RandomForestClassifier(max_depth=8,\n n_estimators=200,\n random_state=7)),\n ('xgb',\n XGBClassifier(base_score=None, booster=None,\n callbacks=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample...\n feature_types=None, gamma=1,\n gpu_id=None, grow_policy=None,\n importance_type=None,\n interaction_constraints=None,\n learning_rate=None, max_bin=None,\n max_cat_threshold=None,\n max_cat_to_onehot=None,\n max_delta_step=None, max_depth=3,\n max_leaves=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=100, n_jobs=None,\n num_parallel_tree=None,\n predictor=None, ...))])"). Set logging level to DEBUG to see the full traceback.
Metrics Saved
<Figure size 1050x700 with 0 Axes>
# Load the per-model metric JSON files produced by the mlflow evaluation runs.
# json_to_df is defined elsewhere in the notebook; its second argument appears to be
# the display name that ends up in the "Model" column - TODO confirm.
# EnsV2 and EnsV3 are not loaded here.
logit_results = json_to_df("logit_metrics.json", "Logistic Regression")
log_logit_results = json_to_df("log_logit_metrics.json", "Log Logistic Regression")
slim_log_logit_results = json_to_df("slim_log_logit_metrics.json", "Slim Log Logistic Regression")
rf_results = json_to_df("rf_metrics.json", "Random Forest")
tuned_xgb_results = json_to_df("xgb_metrics.json", "XGB")
catboost_results = json_to_df("catboost_metrics.json", "CatBoost")
log_tuned_xgb_results = json_to_df("log_xgb_metrics.json", "Log XGB")
ensv1_results = json_to_df("ensv1_metrics.json", "EnsV1")
ensv4_results = json_to_df("ensv4_metrics.json", "EnsV4")
# '''
# The bare string below is a planning note only; it has no effect at runtime.
'''
visualize:
1. Precision on 1 plus others (recall/f1?) for each model
2. ROC curve for each model
3. Test Accuracy for each model
4. preds for 1 for each model
'''
# Stack every model's metrics into a single comparison table
data_df = pd.concat([logit_results,log_logit_results,slim_log_logit_results,
rf_results,tuned_xgb_results,catboost_results,log_tuned_xgb_results,
ensv1_results, ensv4_results])
# Drop rows that are identical in every column (in place)
data_df.drop_duplicates(inplace=True)
# Bare expression so the notebook renders the table
data_df
score | true_negatives | false_positives | false_negatives | true_positives | example_count | accuracy_score | recall_score | precision_score | f1_score | log_loss | roc_auc | precision_recall_auc | Model | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.865385 | 638 | 39 | 66 | 37 | 780 | 0.865385 | 0.359223 | 0.486842 | 0.413408 | 0.341163 | 0.782292 | 0.439211 | Logistic Regression |
1 | 0.873077 | 644 | 33 | 66 | 37 | 780 | 0.873077 | 0.359223 | 0.528571 | 0.427746 | 0.337587 | 0.785131 | 0.440876 | Log Logistic Regression |
1 | 0.871795 | 652 | 25 | 75 | 28 | 780 | 0.871795 | 0.271845 | 0.528302 | 0.358974 | 0.365266 | 0.735842 | 0.400491 | Slim Log Logistic Regression |
1 | 0.879487 | 665 | 12 | 82 | 21 | 780 | 0.879487 | 0.203883 | 0.636364 | 0.308824 | 0.335726 | 0.751846 | 0.416268 | Random Forest |
1 | 0.871795 | 657 | 20 | 80 | 23 | 780 | 0.871795 | 0.223301 | 0.534884 | 0.315068 | 0.337559 | 0.756055 | 0.402883 | XGB |
1 | NaN | 656 | 21 | 81 | 22 | 780 | 0.869231 | 0.213592 | 0.511628 | 0.301370 | 0.346526 | 0.734128 | 0.379118 | CatBoost |
1 | 0.871795 | 657 | 20 | 80 | 23 | 780 | 0.871795 | 0.223301 | 0.534884 | 0.315068 | 0.337559 | 0.756055 | 0.402883 | Log XGB |
1 | 0.888462 | 670 | 7 | 80 | 23 | 780 | 0.888462 | 0.223301 | 0.766667 | 0.345865 | NaN | NaN | NaN | EnsV1 |
1 | 0.891026 | 667 | 10 | 75 | 28 | 780 | 0.891026 | 0.271845 | 0.736842 | 0.397163 | NaN | NaN | NaN | EnsV4 |
# Narrow the comparison table to the headline metrics and rank the models by accuracy
viz_df = data_df[['Model', 'accuracy_score', 'precision_score', 'f1_score', 'recall_score']].sort_values(
    'accuracy_score', ascending=False)
# Grouped (unstacked) bar chart: one cluster of metric bars per model
viz_df.plot(x="Model", kind="bar", stacked=False, title="Top Model Metric Comparison")
<Axes: title={'center': 'Top Model Metric Comparison'}, xlabel='Model'>
Apologies for the legend being in the way! I originally plotted this using Plotly, but the chart did not render in HTML and I wanted to post this today!
Greater detail about the metrics can be found below. The purpose of this plot is to highlight the high accuracy achieved by our ensembling approaches and the performance improvement after log transforming and applying feature selection to our baseline model (Logistic Regression).
# Compute the profit comparison for every model in the table. get_profit is defined
# elsewhere in the notebook; judging by the rendered output it adds the "Profit",
# "No Model Profit" and "Profit Boost From Model" columns - TODO confirm.
get_profit(data_df)
/var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:185: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:186: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/x8/mllw6tg55fs6mvzyfxq6kcwm0000gn/T/ipykernel_22052/1118740994.py:187: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
score | true_negatives | false_positives | false_negatives | true_positives | example_count | accuracy_score | recall_score | precision_score | f1_score | log_loss | roc_auc | precision_recall_auc | Model | Profit | No Model Profit | Profit Boost From Model | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.865385 | 638 | 39 | 66 | 37 | 780 | 0.865385 | 0.359223 | 0.486842 | 0.413408 | 0.341163 | 0.782292 | 0.439211 | Logistic Regression | 877.7 | 373.65 | 134.898970 |
1 | 0.873077 | 644 | 33 | 66 | 37 | 780 | 0.873077 | 0.359223 | 0.528571 | 0.427746 | 0.337587 | 0.785131 | 0.440876 | Log Logistic Regression | 883.7 | 373.65 | 136.504750 |
1 | 0.871795 | 652 | 25 | 75 | 28 | 780 | 0.871795 | 0.271845 | 0.528302 | 0.358974 | 0.365266 | 0.735842 | 0.400491 | Slim Log Logistic Regression | 891.7 | 373.65 | 138.645792 |
1 | 0.879487 | 665 | 12 | 82 | 21 | 780 | 0.879487 | 0.203883 | 0.636364 | 0.308824 | 0.335726 | 0.751846 | 0.416268 | Random Forest | 904.7 | 373.65 | 142.124983 |
1 | 0.871795 | 657 | 20 | 80 | 23 | 780 | 0.871795 | 0.223301 | 0.534884 | 0.315068 | 0.337559 | 0.756055 | 0.402883 | XGB | 896.7 | 373.65 | 139.983942 |
1 | NaN | 656 | 21 | 81 | 22 | 780 | 0.869231 | 0.213592 | 0.511628 | 0.301370 | 0.346526 | 0.734128 | 0.379118 | CatBoost | 895.7 | 373.65 | 139.716312 |
1 | 0.871795 | 657 | 20 | 80 | 23 | 780 | 0.871795 | 0.223301 | 0.534884 | 0.315068 | 0.337559 | 0.756055 | 0.402883 | Log XGB | 896.7 | 373.65 | 139.983942 |
1 | 0.888462 | 670 | 7 | 80 | 23 | 780 | 0.888462 | 0.223301 | 0.766667 | 0.345865 | NaN | NaN | NaN | EnsV1 | 909.7 | 373.65 | 143.463134 |
1 | 0.891026 | 667 | 10 | 75 | 28 | 780 | 0.891026 | 0.271845 | 0.736842 | 0.397163 | NaN | NaN | NaN | EnsV4 | 906.7 | 373.65 | 142.660244 |
# ensv1 has greater precision on minority class
We stand to make a substantial profit from the implementation of Machine Learning in our direct marketing campaigns. As opposed to earning $373.65
with no model, the company can earn $909.70
using our optimal model! This is a 143% profit boost and could bring even greater margins across the company's 50K subscribers!
The optimal model was the Version 1 Ensembling model. This model achieved an accuracy score of 88%
and a precision of 76%
. The model with the highest accuracy was our Version 4 Ensembling Model, but this model had a lower precision, which is an example of why metrics should be understood. Precision measures how many of the customers the model predicts will purchase actually do purchase (true positives out of all predicted positives). We wanted a model that best predicts Purchase
and has the lowest number of false positives. Our EnsV1
Model achieves this and would be my recommendation for the company.
The most important features were somewhat obvious when you think of the business use case. Focus on Customers who:
Observing the summary output of a Logistic Regression model is great when there are statistically significant features, because it gives great insight into which direction each feature is causing the odds of Purchase
to increase or decrease. In our slim model 1 feature could be considered statistically significant after log transforming and resampling the data. This could be experimented with and tested further, but considering the Logistic model's odds ratios the company should consider customers who:
First_purchase
Frequency
# Fit a statsmodels Logit on the selected features and turn its coefficients into odds ratios.
# No values were significant, so they were not included in the analysis.
# NOTE(review): the recorded summary output for this cell reports P>|z| <= 0.004 for all four
# features, which contradicts the remark above — confirm which one is current.
# make data: presumably "slim" restricts the split to the listed columns — verify against make_data
Xtrain,Xtest,ytrain,ytest = m1.make_data("slim", "after", ['P_Art', 'Amount_purchased','Frequency', 'First_purchase'])
# Alternative kept for reference: the "full" dataset variant.
# Xtrain,Xtest,ytrain,ytest = m1.make_data("full", "before", _)
# log transform data; log_transform reads the global Xtrain/Xtest assigned above rather than
# taking them as arguments, so it must run after make_data
Xtrain, Xtest = m1.log_transform(['Frequency', 'First_purchase'])
# NOTE(review): sm.Logit does not add an intercept on its own and no constant column is added
# here, so this fits a model without an intercept — confirm that is intended.
print_model = sm.Logit(ytrain,Xtrain).fit()
print(print_model.summary())
# convert log-odds coefficients into more interpretable odds ratios by exponentiating them
model_odds = pd.DataFrame(np.exp(print_model.params), columns=['OR'])
model_odds
Optimization terminated successfully. Current function value: 0.466430 Iterations 6 Logit Regression Results ============================================================================== Dep. Variable: Choice No. Observations: 3256 Model: Logit Df Residuals: 3252 Method: MLE Df Model: 3 Date: Tue, 13 Jun 2023 Pseudo R-squ.: 0.1363 Time: 18:43:07 Log-Likelihood: -1518.7 converged: True LL-Null: -1758.4 Covariance Type: nonrobust LLR p-value: 1.327e-103 ==================================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------------ P_Art 0.9561 0.069 13.833 0.000 0.821 1.092 Amount_purchased 0.0020 0.000 4.155 0.000 0.001 0.003 Frequency -0.6888 0.075 -9.168 0.000 -0.836 -0.542 First_purchase -0.2104 0.073 -2.877 0.004 -0.354 -0.067 ====================================================================================
OR | |
---|---|
P_Art | 2.601529 |
Amount_purchased | 1.001957 |
Frequency | 0.502203 |
First_purchase | 0.810282 |
Now let's use this optimal model to create a Marketing List that gives our company an actionable item they can use!
Now let's select some observations that will be likely to purchase our material. This could be used to generate leads for marketers in the company.
# Score the hold-out split with the version-4 ensemble and tabulate true labels next to predictions.
Xtrain, Xtest, ytrain, ytest = m1.make_data("full", "before", _)
preds = ensemble_v4_model.predict(Xtest)
# Wrap the true labels in a frame, attach the predictions, then expose the row index as a
# 'CustomerId' column. If real customer ids were available, they would be the index here.
df_preds = (
    pd.DataFrame(ytest)
    .assign(ensv4=preds)
    .reset_index()
    .rename(columns={'index': 'CustomerId'})
)
df_preds
CustomerId | Choice | ensv4 | |
---|---|---|---|
0 | 284 | 1 | 0 |
1 | 244 | 1 | 0 |
2 | 1134 | 0 | 0 |
3 | 440 | 0 | 0 |
4 | 820 | 0 | 0 |
... | ... | ... | ... |
775 | 1179 | 0 | 0 |
776 | 577 | 0 | 0 |
777 | 1309 | 0 | 0 |
778 | 760 | 0 | 0 |
779 | 1277 | 0 | 0 |
780 rows × 3 columns
# Narrow the prediction table in two steps: rows whose true label is 1, then, among those,
# rows the ensemble also predicted as 1.
# NOTE(review): the first step filters on the true 'Choice' label, which would not be known
# for customers who have not been contacted yet — confirm this is the intended list.
test = df_preds.loc[df_preds['Choice'].eq(1)]
test_final = test.loc[test['ensv4'].eq(1)]
test_final
CustomerId | Choice | ensv4 | |
---|---|---|---|
39 | 110 | 1 | 1 |
51 | 157 | 1 | 1 |
131 | 141 | 1 | 1 |
150 | 95 | 1 | 1 |
154 | 194 | 1 | 1 |
237 | 186 | 1 | 1 |
256 | 353 | 1 | 1 |
280 | 76 | 1 | 1 |
316 | 200 | 1 | 1 |
347 | 234 | 1 | 1 |
377 | 29 | 1 | 1 |
401 | 254 | 1 | 1 |
409 | 125 | 1 | 1 |
426 | 395 | 1 | 1 |
438 | 96 | 1 | 1 |
446 | 151 | 1 | 1 |
469 | 335 | 1 | 1 |
490 | 118 | 1 | 1 |
506 | 227 | 1 | 1 |
546 | 365 | 1 | 1 |
548 | 193 | 1 | 1 |
554 | 123 | 1 | 1 |
564 | 178 | 1 | 1 |
596 | 343 | 1 | 1 |
597 | 237 | 1 | 1 |
710 | 38 | 1 | 1 |
742 | 33 | 1 | 1 |
750 | 99 | 1 | 1 |
Now we have 28 customers we can reach out to with a high likelihood they will purchase. This was an interesting topic to experiment with and I look forward to utilizing this use case again in the future!