Keras-TensorFlow code for Telecom Customer Churn modelling

# -*- coding: utf-8 -*-

"""

Created on Sat Nov 28 23:43:13 2020


@author: Rajeev



"""

import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0" #force cuda

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # suppress TensorFlow's INFO/WARNING/ERROR log messages

import pandas as pd

import numpy as np

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense, Activation, Dropout

from tensorflow.keras.utils import to_categorical


from sklearn.utils import class_weight #to infer weights

import time


from focal_loss import BinaryFocalLoss # a loss that down-weights well-classified examples so training focuses on the hard ones

train_csv = pd.read_csv("customer_churn_train.csv", header="infer")

train_x= train_csv.iloc[:,1:20]

train_y= train_csv.loc[:,"Churn?"]




# compute the number of labels

num_labels = 2 # could be inferred from train_y, bad practice alert!
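# A minimal alternative, assuming train_y is still the raw label column at this point:
# num_labels = train_y.nunique()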


# compute class weights to rebalance the loss for imbalanced data; 'balanced' derives them from class frequencies.


class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(train_y), y=train_y)

class_weights = dict(enumerate(class_weights))
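# For reference, 'balanced' weights follow n_samples / (n_classes * class_count), i.e. they are
# inversely proportional to class frequency, so the rarer churn class gets the larger weight
# (roughly {0: 0.58, 1: 3.57} for an illustrative ~14% churn rate).
print(class_weights)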

#test data


test_csv = pd.read_csv("customer_churn_train.csv", header="infer") # still points at the training file; not used below as-is, see the save/predict sketch after training

test_x = test_csv.iloc[:, 1:20]

test_y = test_csv.loc[:, "Churn?"]


# Change Churn to numeric binary labels and change the other boolean-type columns too



train_y.replace(to_replace="True.",value="1",inplace=True)


train_y.replace(to_replace="False.",value="0",inplace=True)


train_y = to_categorical(train_y) # took me a while to figure out why my code was failing; one-hot labels are necessary because the output layer has two softmax units trained with categorical_crossentropy
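# For reference, to_categorical one-hot encodes the 0/1 labels to match the two softmax output
# units below, e.g. to_categorical([0, 1, 1]) -> [[1., 0.], [0., 1.], [0., 1.]]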

train_x.replace(to_replace="yes",value="1",inplace=True)


train_x.replace(to_replace="no",value="0",inplace=True)



# drop the phone number and the two yes/no plan columns. Add-on exercise: impute missing values or derive features from these columns instead of dropping them

train_x.drop(columns='Phone',inplace=True)


train_x.drop(columns="Int'l Plan",inplace=True)

train_x.drop(columns="VMail Plan",inplace=True)


(rows, input_size) = train_x.shape # the input layer width equals the number of feature columns



# #model parameters

batch_size = 10 # smaller batch sizes seem to improve accuracy; 100+ reduces training time but lowers accuracy!

hidden_units = input_size*4 # alternative: int(math.sqrt(input_size)), which would need an import of math

dropout = 0.4 # the rate could be varied per hidden layer, but a single 0.4 works quite well here, as in many other examples!

# model is an MLP with five hidden layers, ReLU activations, and dropout after each hidden layer


model = Sequential()

#layer 1

model.add(Dense(hidden_units,input_dim= input_size))

model.add(Activation('relu'))


model.add(Dropout(dropout))


#layer 2

model.add(Dense(hidden_units))

model.add(Activation('relu'))

model.add(Dropout(dropout))


#layer 3

model.add(Dense(hidden_units))

model.add(Activation('relu'))


model.add(Dropout(dropout))



#layer 4

model.add(Dense(hidden_units))

model.add(Activation('relu'))


model.add(Dropout(dropout))



#layer 5

model.add(Dense(hidden_units))

model.add(Activation('relu'))



model.add(Dropout(dropout))



#label layer

model.add(Dense(num_labels))

# softmax turns the two output units into class probabilities; the argmax gives the crisp prediction

model.add(Activation('softmax'))


model.summary() # there are several other activation functions besides relu. I tried tanh, try others.

"""

In most advanced use cases it would be better to build the model with the functional API: network graphs are architecturally elegant, offer endless configuration possibilities, and are easy to modify and share. A sketch follows below.

"""

# use other loss functions and optimizers and compare the results

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# option 2 is a focal loss that concentrates training on the hard-to-classify examples.

# model.compile(loss=BinaryFocalLoss(gamma=2), optimizer='adam', metrics=['accuracy'])  # note: class_weight belongs to fit(), not compile()
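# A minimal sketch (illustrative, not run here): BinaryFocalLoss pairs most naturally with a
# single sigmoid output and plain 0/1 labels rather than the one-hot softmax setup above;
# binary_model and train_y_binary are illustrative names for that variant.
# binary_model = Sequential([
#     Dense(hidden_units, activation='relu', input_dim=input_size),
#     Dropout(dropout),
#     Dense(1, activation='sigmoid'),
# ])
# binary_model.compile(loss=BinaryFocalLoss(gamma=2), optimizer='adam', metrics=['accuracy'])
# binary_model.fit(train_x, train_y_binary, epochs=200, batch_size=batch_size, validation_split=0.2, class_weight=class_weights)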


#fit model and evaluate.


start = time.perf_counter() # record the start time so the elapsed training time can be reported

model.fit(train_x, train_y, epochs=200, class_weight=class_weights, validation_split=0.2, batch_size=batch_size)


print("training took", round(time.perf_counter() - start, 1), "seconds")
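# A minimal sketch for saving the trained model and scoring the held-out set read earlier.
# Assumption: test_x / test_y first need the same preprocessing as the training data
# (True./False. and yes/no replacement, dropping Phone, Int'l Plan and VMail Plan,
# to_categorical on the labels); the file name below is illustrative.
# model.save("churn_mlp.h5")
# test_loss, test_accuracy = model.evaluate(test_x, test_y, batch_size=batch_size)
# probs = model.predict(test_x)           # shape (n_samples, 2): [P(no churn), P(churn)]
# predicted_class = probs.argmax(axis=1)  # crisp 0/1 prediction per customer
# print("test accuracy:", test_accuracy)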


"""

This brief experiment shows good training and validation accuracy for a simple loss function with the adam optimizer. There is a 2-3 percent uptick from using class weights that are inversely proportional to class frequency. Surprisingly, batch size makes a difference too! The number of neurons per layer and the number of layers are still an art.

"""
