##################################
# Analysis of Bank Marketing Data set:
#   https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
# Reference: [Moro et al., 2014] S. Moro, P. Cortez and P. Rita.
#   A Data-Driven Approach to Predict the Success of Bank Telemarketing.
#   Decision Support Systems, Elsevier, 62:22-31, June 2014
# Author: DIana Marek
# Date: December 2016
#
# Purpose: walk through loading the bank marketing CSV, inspecting and
# summarising it, adding/exporting a column, and plotting it with both base
# graphics and ggplot2.
##################################

# Packages ----
library("ggplot2")

# Workspace and working directory ----

# list workspace
ls()

# Reset R's brain
# NOTE(review): this wipes every object in the calling workspace. It is kept
# because the script was written as an interactive course walkthrough, but it
# should be removed if the file is ever source()d from other code.
rm(list = ls())

# check wd
getwd()

# set wd
# The path below only exists on the author's machine. Guard the call so the
# script can still be run from whichever directory already holds the CSV files
# instead of aborting on setwd().
data_dir <- "/Users/dmarek/EducationSIB/Courses_2016/Nespresso_R_course"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Directory not found, staying in ", getwd(), ": ", data_dir)
}

# confirm wd
getwd()

# Read the data ----

# stringsAsFactors = TRUE is required: levels(), the colour-by-marital-status
# indexing and the factor-based plots below all rely on the text columns being
# factors, and R >= 4.0 reads them as character by default.
bank_data <- read.csv2("bank.csv", stringsAsFactors = TRUE)
# Alternatives:
# bank_data <- read.csv("bank.csv", sep = ";", stringsAsFactors = TRUE)
# bank_data <- read.table("bank.csv", header = TRUE, sep = ";",
#                         stringsAsFactors = TRUE)

# equivalent to double click on the object name; only meaningful in an
# interactive session
if (interactive()) {
  View(bank_data)
}

# customers <- read.csv2("FR_Customers_sample_small.csv")
orders <- read.csv2("FR_Orders_sample_small.csv")
# Go through as.character() so that a factor column is converted to its
# printed values rather than to its internal level codes.
chf_revenue <- as.numeric(as.character(orders$CHF_REVENUE))
hist(chf_revenue)

# Structure, info about bank_data ----
class(bank_data)
head(bank_data)
dim(bank_data)
str(bank_data)
colnames(bank_data)
class(bank_data$age)
class(bank_data$balance)
class(bank_data$job)
levels(bank_data$job)

# counts occurrences for each level of the factor
table(bank_data$education)
# equivalent to the above command
with(bank_data, table(education))
# counts can be converted to proportions with prop.table(); it needs the
# table of counts, not the raw factor
prop.table(table(bank_data$education))

# Basic stats ----
summary(bank_data)

# Subsetting ----

# keeps only the customers contacted in december
subset(bank_data, month == "dec")
# keeps only the customers contacted in december and with a secondary or
# tertiary education
subset(
  bank_data,
  month == "dec" & (education == "tertiary" | education == "secondary")
)

# Customised summaries ----
by(bank_data$balance, bank_data$job, mean)
# wrapping the inputs in data.frame()/list() with names changes the labels of
# the output table
aggregate(
  data.frame(balance = bank_data$balance),
  list(job = bank_data$job),
  range
)

# Adding data (column) ----
# Random two-level factor, one value per row (no seed is set, so the column
# differs between runs).
gender <- sample(c(1, 2), nrow(bank_data), replace = TRUE)
gender <- as.factor(gender)
bank_data_updated <- cbind(bank_data, gender)

# Exporting data ----
write.table(
  bank_data_updated,
  file = "bank_data_updated.csv",
  quote = FALSE,
  sep = ";",
  row.names = FALSE
)

# Graphics (base) ----

# Histogram, raw scale
hist(
  bank_data$balance,
  breaks = 100,
  xlab = "Customer balance",
  main = "Customer balance distribution",
  freq = FALSE,
  col = "green",
  ylim = c(0, 8e-04)
)
lines(density(bank_data$balance), col = "orange3", lwd = 2)

# Histogram, log scale: log10() is only defined for strictly positive values,
# so negative and null balances are dropped first
has_positive_balance <- bank_data$balance > 0
log_balance <- log10(bank_data$balance[has_positive_balance])
log_age <- log10(bank_data$age[has_positive_balance])

hist(
  log_balance,
  breaks = 100,
  xlab = "Customer balance (log scale)",
  main = "Customer balance distribution (log scale)",
  freq = FALSE,
  col = "green"
)
# na.rm is an argument of density(), not a graphical parameter of lines();
# its default is already FALSE, so it is simply omitted here
lines(density(log_balance), col = "orange3", lwd = 2)

# Boxplot
boxplot(
  duration ~ contact,
  data = bank_data,
  main = "Call duration per contact communication type",
  xlab = "Contact communication type",
  ylab = "Call duration (sec)"
)
# adds the actual data points onto the plot
points(duration ~ contact, data = bank_data, col = "red", pch = 3)

boxplot(
  duration ~ contact,
  data = bank_data,
  main = "Call duration per contact communication type",
  ylab = "Contact communication type",
  xlab = "Call duration (sec)",
  horizontal = TRUE, # horizontal plot
  las = 0, # default style of axis labels (always parallel to axis), see ?par
  # different magnification for axis annotation, labels and titles
  cex.axis = 1.2,
  cex.lab = 1.5,
  cex.main = 2
)

# Plot with regression line and correlation, colouring by factor
plot(
  bank_data$age,
  bank_data$balance,
  type = "p",
  col = "red",
  main = "Balance as function of customers' age",
  xlab = "Age (years)",
  ylab = "Balance (euros)"
)
abline(lm(bank_data$balance ~ bank_data$age), col = "blue")
legend(
  "topright",
  legend = c("clients", "fit"),
  pch = c("o", "-"),
  col = c(2, 4),
  bg = "gray90"
)

# Indexing the colour vector by the factor picks one colour per level
marital_colours <- c("red", "green3", "blue")
plot(
  bank_data$age,
  bank_data$balance,
  type = "p",
  main = "Balance as function of customers' age and marital status",
  xlab = "Age (years)",
  ylab = "Balance (euros)",
  col = marital_colours[bank_data$marital]
)
legend(
  "topright",
  legend = levels(bank_data$marital),
  pch = c("o", "o", "o"),
  col = marital_colours,
  bg = "gray90"
)

shapiro.test(bank_data$balance) # data not normal
shapiro.test(bank_data$age) # data not normal

# removing negative and null balances
plot(
  log_age,
  log_balance,
  type = "p",
  col = "red",
  main = "Log balance as function of log customers' age",
  xlab = "Log10 age (years)",
  ylab = "Log10 balance (euros)"
)
abline(lm(log_balance ~ log_age), col = "blue")

shapiro.test(log_balance) # data not normal but better
shapiro.test(log10(bank_data$age)) # data not normal but better

cor.test(bank_data$age, bank_data$balance, method = "spearman") # 0.08 significant correlation
cor.test(bank_data$age, bank_data$balance)

# Graphics (ggplot2) ----

# Scatter plots
balance_age_plot <- ggplot(data = bank_data, aes(x = age, y = balance))
balance_age_plot + geom_point()
balance_age_plot + geom_point(alpha = 0.3)
balance_age_plot + geom_point(alpha = 0.3, color = "blue")
balance_age_plot + geom_point(alpha = 0.3, color = "blue", size = 3)

# Colouring and shaping by levels of the factor
balance_age_plot +
  geom_point(alpha = 0.3, aes(color = marital)) +
  ggtitle("Account balance as function of age and marital status") +
  theme(
    plot.title = element_text(
      size = 10, face = "bold", margin = margin(10, 0, 10, 0)
    )
  ) +
  theme(plot.title = element_text(hjust = 0.45))

# remove background panel
balance_age_plot +
  geom_point(alpha = 0.3, aes(color = marital, shape = marital)) +
  ggtitle("Account balance as function of age and marital status") +
  theme(
    plot.title = element_text(
      size = 10, face = "bold", margin = margin(10, 0, 10, 0)
    )
  ) +
  theme(plot.title = element_text(hjust = 0.45)) +
  theme(panel.background = element_rect(fill = "white"))

# add regression line
balance_age_plot +
  geom_point(alpha = 0.3, aes(color = marital, shape = marital)) +
  ggtitle("Account balance as function of age and marital status") +
  theme(
    plot.title = element_text(
      size = 10, face = "bold", margin = margin(10, 0, 10, 0), hjust = 0.45
    )
  ) +
  stat_smooth(method = "lm")

# Boxplot
duration_contact_boxplot <- ggplot(
  data = bank_data,
  aes(x = contact, y = duration)
)
duration_contact_boxplot + geom_boxplot()
duration_contact_boxplot + geom_boxplot(alpha = 1)
duration_contact_boxplot + geom_boxplot(alpha = 0)
duration_contact_boxplot +
  geom_boxplot(alpha = 0) +
  geom_jitter(alpha = 0.3, color = "tomato")
# Layers are drawn in order: jitter first so the boxes stay on top
duration_contact_boxplot +
  geom_jitter(alpha = 0.3, color = "tomato") +
  geom_boxplot(alpha = 0) +
  ggtitle("Call duration as function of contact type")

# Histogram, frequencies
balance_histo <- ggplot(data = bank_data, aes(x = balance)) +
  geom_histogram() +
  ggtitle("Account balance distribution")
# The plot object was previously built but never displayed
balance_histo

# Histogram, density
# NOTE(review): `..density..` is the older ggplot2 notation; newer releases
# prefer after_stat(density). Left unchanged because the installed ggplot2
# version is not visible from this script.
balance_histo_density <- ggplot(
  data = bank_data,
  aes(x = balance, ..density..)
) +
  geom_histogram(bins = 70, colour = "black", fill = "green") +
  geom_density(color = "orange") +
  ggtitle("Account balance distribution")
balance_histo_density

# Saving ggplot: name both arguments explicitly instead of relying on partial
# matching of `file` to `filename`
ggsave(
  filename = "hist_density_ggplot.pdf",
  plot = balance_histo_density,
  width = 10,
  height = 5
)