# R programming on a supervised learning dataset - HR dataset
# (This title has to be a comment: as bare text it is not valid R and stops
# the script from parsing.)

# Read the HR data from a CSV file chosen interactively; the first row is
# taken as the column names.
HRdataset <- read.csv(file.choose(), header = TRUE)

# Drop the `role` and `salary` columns from HRdataset.
HRdataset$role <- NULL
HRdataset$salary <- NULL

# Keep a working copy of the data named HRdatasetNew.
HRdatasetNew <- HRdataset

# Print summary statistics for every remaining column.
summary(HRdatasetNew)
# Load the libraries used for data partitioning and the correlation plot.
library(caret)
library(corrplot)

# Monthly hours against the `left` flag. scatterplot() belongs to the car
# package, which is only attached later in this script, so it is called
# through its namespace here to work regardless of attach order.
car::scatterplot(average_montly_hours ~ left, data = HRdatasetNew)

# Distribution of monthly hours for each value of `left`. Passing the two
# columns as separate arguments would draw them as two unrelated boxes on a
# single shared axis instead of grouping hours by `left`.
boxplot(average_montly_hours ~ left, data = HRdatasetNew)

# Correlate the first ten columns and display the matrix as a corrplot.
COR <- cor(HRdatasetNew[, 1:10])
corrplot(COR)
# Split the dataset into train (70%) and test (remaining rows).
# Fix the RNG seed so the same rows are selected on every run.
set.seed(123)
# Partition on the outcome as a factor so createDataPartition() samples within
# each class of `left`; a numeric outcome is grouped by percentiles instead,
# which does not separate a 0/1 flag into its classes reliably.
SampleSplit <- createDataPartition(
  as.factor(HRdatasetNew$left),
  p = 0.70,
  list = FALSE
)
Trainsplit <- HRdatasetNew[SampleSplit, ]
TestSplit <- HRdatasetNew[-SampleSplit, ]
# Fit a logistic regression of `left` on all other columns of the training
# data. The response is named by column (not as Trainsplit$left) so that every
# variable in the formula is resolved through `data`.
Logitmod <- glm(left ~ ., family = "binomial", data = Trainsplit)
summary(Logitmod)
library(car)
# Store the predicted probabilities (type = "response") on both splits.
Trainsplit$score <- predict(Logitmod, newdata = Trainsplit, type = "response")
TestSplit$score <- predict(Logitmod, newdata = TestSplit, type = "response")
head(Trainsplit$score)
# Turn the training scores into a 0/1 class: 1 when the score is at least 0.6.
Trainsplit[["score_new"]] <- with(Trainsplit, ifelse(score >= 0.6, 1, 0))
# Turn the test scores into a 0/1 class: 1 when the score is at least 0.5.
# NOTE(review): the train and test cut-offs differ (0.6 vs 0.5) - confirm this
# is intended before comparing the two tables below.
TestSplit[["score_new"]] <- with(TestSplit, ifelse(score >= 0.5, 1, 0))
head(Trainsplit$score_new)
tail(Trainsplit$score_new)
# Cross-tabulate the predicted class against the actual `left` value,
# first for the training split and then for the test split.
ConfusionMatrix1 <- xtabs(~ Trainsplit$score_new + Trainsplit$left)
confusionMatrix2 <- xtabs(~ TestSplit$score_new + TestSplit$left)
head(TestSplit$score_new)
tail(TestSplit$score_new)
# Load pscl, which provides pR2(), and print its output for the logistic model.
library(pscl)
pR2(Logitmod)
# Load InformationValue, which provides Concordance().
library(InformationValue)
# Concordance statistic of the training scores against the actual outcome.
concor <- Concordance(
  Trainsplit$left,
  Trainsplit$score
)
# Random forest
library(randomForest)
library(caret)
# By this point Trainsplit may carry columns produced by other models in this
# script (scores and predicted classes). They are derived from `left`, so a
# `.` formula over the full frame would leak the target into the predictors.
# Fit on a copy with those columns removed; names that are absent are ignored.
rf_derived_cols <- c(
  "score", "score_new", "Randomforest",
  "cartscore", "cartscore_new", "ScoreBayes"
)
rf_train <- Trainsplit[
  , setdiff(names(Trainsplit), rf_derived_cols),
  drop = FALSE
]
# `left` is converted to a factor so randomForest() fits a classifier.
modelRF <- randomForest(as.factor(left) ~ ., data = rf_train, do.trace = TRUE)
modelRF$err.rate
importance(modelRF)
varImpPlot(modelRF)
# Predict with the random forest and cross-tabulate against the actual `left`.
Trainsplit$Randomforest <- predict(modelRF, Trainsplit)
TestSplit$Randomforest <- predict(modelRF, TestSplit)
ConfusionMatrix3 <- xtabs(~ Trainsplit$Randomforest + Trainsplit$left)
ConfusionMatrix4 <- xtabs(~ TestSplit$Randomforest + TestSplit$left)
# ROC (receiver operating characteristic) curve and KS diagnostics for the
# logistic model, using its fitted values on the training data.
library(InformationValue)
# The score argument is spelled out in full (`predictedScores`) rather than
# relying on partial matching of an abbreviated name.
plotROC(
  actuals = Trainsplit$left,
  predictedScores = as.numeric(fitted(Logitmod))
)
ks_plot(
  actuals = Trainsplit$left,
  predictedScores = as.numeric(fitted(Logitmod))
)
ks_stat(
  actuals = Trainsplit$left,
  predictedScores = as.numeric(fitted(Logitmod))
)
# Decision tree using the CART algorithm (rpart)
library(rpart)
library(rpart.plot)
# Trainsplit may already hold columns written by other models in this script
# (scores and predicted classes derived from `left`). Remove them before
# fitting so the `.` formula only picks up the original predictors; names
# that are absent are ignored.
cart_derived_cols <- c(
  "score", "score_new", "Randomforest",
  "cartscore", "cartscore_new", "ScoreBayes"
)
cart_train <- Trainsplit[
  , setdiff(names(Trainsplit), cart_derived_cols),
  drop = FALSE
]
fit1 <- rpart(
  left ~ .,
  data = cart_train,
  control = rpart.control(minsplit = 30, cp = 0.001)
)
summary(fit1)
rpart.plot(fit1)
prp(fit1)
plotcp(fit1)
fit1$cptable
# Tree predictions for both splits.
Trainsplit$cartscore <- predict(fit1, newdata = Trainsplit)
TestSplit$cartscore <- predict(fit1, newdata = TestSplit)
# Convert the tree's own predictions (cartscore) into a 0/1 class at 0.5.
# The logistic `score` column must not be used here: it belongs to a
# different model.
Trainsplit$cartscore_new <- ifelse(Trainsplit$cartscore > 0.5, 1, 0)
TestSplit$cartscore_new <- ifelse(TestSplit$cartscore > 0.5, 1, 0)
# Confusion matrices. Both caret and InformationValue export a function named
# confusionMatrix(); the (actuals, predicted) argument order used here is
# InformationValue's, so the call is namespace-qualified to stay independent
# of the order in which the packages were attached.
InformationValue::confusionMatrix(
  actuals = Trainsplit$left,
  predictedScores = Trainsplit$cartscore_new
)
InformationValue::confusionMatrix(
  actuals = TestSplit$left,
  predictedScores = TestSplit$cartscore_new
)
# The same comparison as plain cross-tabulations.
xtabs(~ Trainsplit$cartscore_new + Trainsplit$left)
xtabs(~ TestSplit$cartscore_new + TestSplit$left)
# Naive Bayes classifier (e1071)
library(e1071)
# Trainsplit may already hold columns written by other models in this script
# (scores and predicted classes derived from `left`). Remove them before
# fitting so the `.` formula only picks up the original predictors; names
# that are absent are ignored.
nb_derived_cols <- c(
  "score", "score_new", "Randomforest",
  "cartscore", "cartscore_new", "ScoreBayes"
)
nb_train <- Trainsplit[
  , setdiff(names(Trainsplit), nb_derived_cols),
  drop = FALSE
]
BayesTrain <- naiveBayes(as.factor(left) ~ ., data = nb_train)
summary(BayesTrain)
# Predicted class for both splits.
Trainsplit$ScoreBayes <- predict(BayesTrain, Trainsplit)
TestSplit$ScoreBayes <- predict(BayesTrain, TestSplit)
# Cross-tabulate the naive Bayes predictions against the actual `left` value.
ConfMatrix7 <- xtabs(~ Trainsplit$ScoreBayes + Trainsplit$left)
ConfMatrix8 <- xtabs(~ TestSplit$ScoreBayes + TestSplit$left)
# KNN (k-nearest neighbours) data preparation and cross tables
library(dummies)
# Pass the two code columns through dummy.data.frame().
# NOTE(review): whether new indicator columns are produced depends on the
# column types of role_code / salary.code - confirm against the data.
dummy_df <- dummy.data.frame(HRdatasetNew[, c("role_code", "salary.code")])
# Start from the working data with the dummy output appended on the right,
# then drop the original code columns.
HRdataset1 <- cbind.data.frame(HRdatasetNew, dummy_df)
HRdataset1$role_code <- NULL
HRdataset1$salary.code <- NULL
# Coerce the two flag columns to numeric.
HRdataset1[["Work_accident"]] <- as.numeric(HRdataset1[["Work_accident"]])
HRdataset1[["promotion_last_5years"]] <- as.numeric(
  HRdataset1[["promotion_last_5years"]]
)