# R programming on a supervised learning dataset - HR dataset

# Load the HR dataset from a CSV file chosen interactively; the first row of
# the file supplies the column names. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
HRdataset <- read.csv(file.choose(), header = TRUE)


# Drop the "role" and "salary" columns; every other column is kept in its
# original order.
HRdataset <- HRdataset[setdiff(names(HRdataset), c("role", "salary"))]

# Keep a working copy of the trimmed data under the name HRdatasetNew.
HRdatasetNew <- HRdataset


# Summary statistics for every column of the working copy.
summary(HRdatasetNew)

# Attach the libraries used below. scatterplot() comes from car, so car has to
# be attached before the first plot instead of later in the script.
library(caret)
library(corrplot)
library(car)

# Average monthly hours plotted against `left`.
scatterplot(average_montly_hours ~ left, data = HRdatasetNew)

# Distribution of average monthly hours within each value of `left`. Passing
# the two columns as separate arguments would instead draw `left` and the
# hours as two unrelated boxes on one shared axis.
boxplot(average_montly_hours ~ left, data = HRdatasetNew)


# Correlation matrix of the first ten columns, then drawn as a correlation
# plot.
COR <- cor(HRdatasetNew[, seq_len(10L)])
corrplot(COR)


# Split the dataset into train and test partitions (p = 0.70 goes to train).

# Fix the RNG seed so the random partition, and every result that depends on
# it further down, can be reproduced from run to run.
set.seed(123)
SampleSplit <- createDataPartition(HRdatasetNew$left, p = 0.70, list = FALSE)

# Rows selected by the partition form the training set; the rest are held out.
Trainsplit <- HRdatasetNew[SampleSplit, ]
TestSplit <- HRdatasetNew[-SampleSplit, ]


# Logistic regression of `left` on all other columns of the training data.
# The response is written as a bare column name so that it is resolved through
# the `data` argument rather than being tied to the global Trainsplit object.
Logitmod <- glm(left ~ ., family = "binomial", data = Trainsplit)
summary(Logitmod)

library(car)

# Predicted probabilities (type = "response") for both partitions, stored in a
# new `score` column.
Trainsplit$score <- predict(Logitmod, newdata = Trainsplit, type = "response")
TestSplit$score <- predict(Logitmod, newdata = TestSplit, type = "response")

head(Trainsplit$score)


# Turn the logistic score into a 0/1 label: 1 when the score reaches the
# cutoff, 0 otherwise. The training partition uses a cutoff of 0.6.
Trainsplit[["score_new"]] <- ifelse(Trainsplit[["score"]] >= 0.6, 1, 0)

# The test partition uses a cutoff of 0.5.
TestSplit[["score_new"]] <- ifelse(TestSplit[["score"]] >= 0.5, 1, 0)

# Peek at both ends of the training labels.
head(Trainsplit[["score_new"]])
tail(Trainsplit[["score_new"]])


# Peek at both ends of the test labels produced by the logistic model.
head(TestSplit$score_new)
tail(TestSplit$score_new)

# Confusion tables for the logistic model: predicted label (rows) against the
# observed `left` value (columns), one table per partition.
ConfusionMatrix1 <- xtabs(~ Trainsplit$score_new + Trainsplit$left)
confusionMatrix2 <- xtabs(~ TestSplit$score_new + TestSplit$left)


# Pseudo R-squared measures for the logistic model (pscl).
library(pscl)
pR2(Logitmod)

# Concordance statistic of the training scores against the observed outcome
# (InformationValue).
library(InformationValue)
concor <- Concordance(
  actuals = Trainsplit$left,
  predictedScores = Trainsplit$score
)


# Random forest classifier for `left`.
library(randomForest)
library(caret)

# Fit on the original dataset columns only. By this point Trainsplit also
# carries columns written by the logistic-regression section (score,
# score_new); leaving them in would let `.` pull another model's output in as
# a predictor.
modelRF <- randomForest(
  as.factor(left) ~ .,
  data = Trainsplit[, names(HRdatasetNew), drop = FALSE],
  do.trace = TRUE
)

# Error rates recorded while the forest was grown, then variable importance
# as a table and as a plot.
modelRF$err.rate
importance(modelRF)
varImpPlot(modelRF)


# Class predictions from the random forest for both partitions, stored in a
# `Randomforest` column.
Trainsplit[["Randomforest"]] <- predict(modelRF, newdata = Trainsplit)
TestSplit[["Randomforest"]] <- predict(modelRF, newdata = TestSplit)

# Confusion tables: forest prediction (rows) against observed `left` (columns).
ConfusionMatrix3 <- xtabs(~ Trainsplit$Randomforest + Trainsplit$left)
ConfusionMatrix4 <- xtabs(~ TestSplit$Randomforest + TestSplit$left)


# ROC curve (receiver operating characteristic) and KS diagnostics for the
# logistic model on the training partition.
library(InformationValue)

# Fitted probabilities of the logistic model, computed once and reused.
logit_fitted <- as.numeric(fitted(Logitmod))

# The argument is spelled `predictedScores` in full instead of relying on
# partial matching of an abbreviated name.
plotROC(actuals = Trainsplit$left, predictedScores = logit_fitted)
ks_plot(actuals = Trainsplit$left, predictedScores = logit_fitted)
ks_stat(actuals = Trainsplit$left, predictedScores = logit_fitted)


# Decision tree using the CART algorithm (rpart).
library(rpart)
library(rpart.plot)

# Fit on the original dataset columns only. Trainsplit has accumulated columns
# written by the earlier models (score, score_new, Randomforest); with `.` on
# the right-hand side they would all become predictors, and Randomforest holds
# the forest's predictions on these very training rows.
fit1 <- rpart(
  left ~ .,
  data = Trainsplit[, names(HRdatasetNew), drop = FALSE],
  control = rpart.control(minsplit = 30, cp = 0.001)
)

summary(fit1)

# Two renderings of the fitted tree.
rpart.plot(fit1)
prp(fit1)

# Complexity-parameter diagnostics: plot and underlying table.
plotcp(fit1)
fit1$cptable


# Predictions of the CART model for both partitions, stored in `cartscore`.
# NOTE(review): assumes `left` is numeric so predict() returns one numeric
# value per row - confirm against the dataset.
Trainsplit$cartscore <- predict(fit1, newdata = Trainsplit)
TestSplit$cartscore <- predict(fit1, newdata = TestSplit)

# Turn the CART prediction into a 0/1 label with a 0.5 cutoff. The label is
# derived from `cartscore`; thresholding `score` here would merely reproduce
# the logistic-regression labels under a CART name.
Trainsplit$cartscore_new <- ifelse(Trainsplit$cartscore > 0.5, 1, 0)
TestSplit$cartscore_new <- ifelse(TestSplit$cartscore > 0.5, 1, 0)


# Confusion matrices for the `cartscore_new` labels.
# Both caret and InformationValue export a function called confusionMatrix, so
# the unqualified name depends on which package was attached last. The call is
# qualified to make the (actuals, predictedScores) form explicit.
InformationValue::confusionMatrix(
  actuals = Trainsplit$left,
  predictedScores = Trainsplit$cartscore_new
)
InformationValue::confusionMatrix(
  actuals = TestSplit$left,
  predictedScores = TestSplit$cartscore_new
)

# The same comparison as plain cross-tabulations: predicted label (rows)
# against observed `left` (columns).
xtabs(~ Trainsplit$cartscore_new + Trainsplit$left)
xtabs(~ TestSplit$cartscore_new + TestSplit$left)


# Naive Bayes classifier for `left` (e1071).
library(e1071)

# Fit on the original dataset columns only. Trainsplit now also holds the
# outputs of the earlier models (score, score_new, Randomforest, cartscore,
# cartscore_new); with `.` on the right-hand side they would all be used as
# predictors.
BayesTrain <- naiveBayes(
  as.factor(left) ~ .,
  data = Trainsplit[, names(HRdatasetNew), drop = FALSE]
)

summary(BayesTrain)

# Class predictions for both partitions, stored in `ScoreBayes`.
Trainsplit$ScoreBayes <- predict(BayesTrain, Trainsplit)
TestSplit$ScoreBayes <- predict(BayesTrain, TestSplit)


# Confusion tables for the naive Bayes model: predicted class (rows) against
# observed `left` (columns), one table per partition.
ConfMatrix7 <- xtabs(~ Trainsplit$ScoreBayes + Trainsplit$left)
ConfMatrix8 <- xtabs(~ TestSplit$ScoreBayes + TestSplit$left)


# Data preparation for KNN (k-nearest neighbours) and cross tables.
library(dummies)

# Dummy-encode the role_code and salary.code columns of the working copy.
dummy_df <- dummy.data.frame(HRdatasetNew[, c("role_code", "salary.code")])

# Append the encoded columns to a copy of the working data, then remove the
# two source columns.
HRdataset1 <- cbind.data.frame(HRdatasetNew, dummy_df)
HRdataset1$role_code <- NULL
HRdataset1$salary.code <- NULL

# Store the two flag columns as numeric.
HRdataset1[["Work_accident"]] <- as.numeric(HRdataset1[["Work_accident"]])
HRdataset1[["promotion_last_5years"]] <-
  as.numeric(HRdataset1[["promotion_last_5years"]])

# To view or add a comment, sign in

# More articles by Mr Kumar

# Community insights

# Other members also viewed