Keras/TensorFlow code for telecom customer churn modelling
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 28 23:43:13 2020
@author: Rajeev
"""
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" #force cuda
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight #to compute balanced class weights for the imbalanced target
import time
from focal_loss import BinaryFocalLoss #a loss that down-weights easy examples so training focuses on the hard, misclassified ones
train_csv = pd.read_csv("customer_churn_train.csv", header="infer")
train_x= train_csv.iloc[:,1:20]
train_y= train_csv.loc[:,"Churn?"]
# compute the number of labels
num_labels = 2 # could be inferred from train_y, bad practice alert!
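# an alternative is to infer it from the data; at this point train_y still holds the raw churn strings:
# num_labels = train_y.nunique()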
#compute weights to be used for adaptation of loss function for unbalanced data based on frequency of classes.
class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(train_y), y=train_y)
class_weights = dict(enumerate(class_weights))
#test data
test_csv = pd.read_csv("customer_churn_train.csv", header="infer") # the training CSV is reused here as a placeholder; see the save/evaluate sketch after training below
test_x = test_csv.iloc[:,1:20]
test_y = test_csv.loc[:,"Churn?"]
#Change Churn? to numeric binary labels and convert the other boolean-style columns too
train_y.replace(to_replace="True.",value="1",inplace=True)
train_y.replace(to_replace="False.",value="0",inplace=True)
train_y = to_categorical(train_y) #one-hot encode to match the 2-unit softmax output; it took me a while to figure out why training failed without this step!
train_x.replace(to_replace="yes",value="1",inplace=True)
train_x.replace(to_replace="no",value="0",inplace=True)
#drop phone no column. Add-on exercise, impute missing values or derive features from these columns
train_x.drop(columns=['Phone', "Int'l Plan", "VMail Plan"], inplace=True)
(rows,input_size) = train_x.shape #input layer shape is no of columns
# #model parameters
batch_size = 10 #smaller batch sizes seem to improve accuracy; 100+ reduces training time but lowers accuracy
hidden_units = input_size*4 #alternative heuristic: int(math.sqrt(input_size)), which would need `import math`
dropout = 0.4 #this rate could be set per hidden layer, but a single value works quite well here, as in many other examples
# model is an MLP with five hidden layers, each followed by ReLU and dropout
model = Sequential()
#layer 1
model.add(Dense(hidden_units,input_dim= input_size))
model.add(Activation('relu'))
model.add(Dropout(dropout))
#layer 2
model.add(Dense(hidden_units))
model.add(Activation('relu'))
model.add(Dropout(dropout))
#layer 3
model.add(Dense(hidden_units))
model.add(Activation('relu'))
model.add(Dropout(dropout))
#layer 4
model.add(Dense(hidden_units))
model.add(Activation('relu'))
model.add(Dropout(dropout))
#layer 5
model.add(Dense(hidden_units))
model.add(Activation('relu'))
model.add(Dropout(dropout))
#label layer
model.add(Dense(num_labels))
# softmax turns the two output units into class probabilities
model.add(Activation('softmax'))
model.summary() # there are several other activation functions besides relu. I tried tanh; try others.
"""
It would be better in most advanced use cases to build a model using the functional API as network graphs are architecturally elegant with endess configuration possibilities plus being easily modifiable and shareable.
"""
# use other loss functions and optimizers and see how the results vary
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#option 2: a focal loss that down-weights easy examples so training concentrates on hard-to-classify ones.
#Note: class_weight is an argument of fit(), not compile(), and BinaryFocalLoss expects binary 0/1 targets
#(typically a single sigmoid output rather than the one-hot/softmax setup used above).
#model.compile(loss=BinaryFocalLoss(gamma=2), optimizer='adam', metrics=["accuracy"])
#fit model and evaluate.
start = time.perf_counter()
model.fit(train_x, train_y, epochs=200, class_weight=class_weights, validation_split=0.2, batch_size=batch_size)
print("training time (seconds):", time.perf_counter() - start)
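# A minimal sketch of the "save and predict" step mentioned with the test data above.
# It assumes the held-out frame needs the same preprocessing as the training frame;
# the file name "churn_mlp.h5" is only an example.
test_y.replace(to_replace="True.", value="1", inplace=True)
test_y.replace(to_replace="False.", value="0", inplace=True)
test_y = to_categorical(test_y)
test_x.replace(to_replace="yes", value="1", inplace=True)
test_x.replace(to_replace="no", value="0", inplace=True)
test_x.drop(columns=['Phone', "Int'l Plan", "VMail Plan"], inplace=True)
test_loss, test_acc = model.evaluate(test_x, test_y, batch_size=batch_size)
print("test loss:", test_loss, "test accuracy:", test_acc)
model.save("churn_mlp.h5")            # persist architecture + weights
churn_probs = model.predict(test_x)   # per-class probabilities for each customer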
"""
The brief experiment shows good training and validation accuracy for a simple loss function with adam optimizer. There's a 2-3 percent uptick on using weights that are proportional to class frequency. Suprisingly batch size makes a difference too! No of neurons in a layer and no of layers is still an art!
"""