Mydata1 <- read.csv("Tayko.csv")
head(Mydata1)
#prints the first few rows of the dataset. You can view the complete dataset on a spreadsheet by double clicking on the file name in the environment window
You may not need in the dataset, the first column (i.e. variable X). It can be dropped using the following codes
Mydata <- within(Mydata1, rm(sequence_number,source_a,source_b,source_c,source_d,source_e,source_m,source_o,source_h,source_r,source_s,source_t,source_u,source_p,source_x,source_w,X1st_update_days_ago,Purchase))
Inspect the dataframe as a table and print variable names.
fix(Mydata)
# outputting data column names
names(Mydata)
# outputting data
write.table(Mydata)
Search if the data has any missing values. Many procedures do not work if the data has missing values or special instructions need to be given when they are present.
##Number of missing values (NAs) in the dataframe
sum(is.na(Mydata))
[1] 0
##To compute the total missing values in each column is to use colSums()
#colSums(is.na(Mydata))
colSums(is.na(Mydata))
US Freq last_update_days_ago
0 0 0
Web.order Gender.male Address_is_res
0 0 0
Spending
0
##Simple ways to deal with missing value https://uc-r.github.io/missing_values
Now the data is ready for analysis. It is always a good idea to start with exploratory data analysis techniques.
#To print the variable names and type of data under each variable
str(Mydata)
'data.frame': 2000 obs. of 7 variables:
$ US : int 1 1 1 1 1 1 1 1 1 1 ...
$ Freq : int 2 0 2 1 1 1 2 1 4 1 ...
$ last_update_days_ago: int 3662 2900 3883 829 869 1995 1498 3397 525 3215 ...
$ Web.order : int 1 1 0 0 0 0 0 0 1 0 ...
$ Gender.male : int 0 1 0 1 0 0 0 1 1 0 ...
$ Address_is_res : int 1 0 0 0 0 1 1 0 0 0 ...
$ Spending : int 128 0 127 0 0 0 0 0 489 174 ...
# Plot the data for each column to get a feel for it
# as well as a histogram to see the data distribution
#loading packages
require("ggplot2")
Loading required package: ggplot2
plot(Mydata$Freq, Mydata$Spending)
plot(Mydata$last_update_days_ago, Mydata$Spending)
For descriptive statistics, the function ‘summary’ can be used. The package ‘pastecs’ is used below for a more detailed output. Do check for missing values and type of variable (must be numeric).
# Compute descriptive statistics
library(pastecs)
res <- stat.desc(Mydata)
round(res, 2)
# Correlation matrix and respective p-values
library("Hmisc")
res2 <- rcorr(as.matrix(Mydata))
res2
US Freq last_update_days_ago Web.order Gender.male
US 1.00 0.03 0.04 0.00 0.03
Freq 0.03 1.00 -0.35 0.10 -0.04
last_update_days_ago 0.04 -0.35 1.00 -0.03 0.02
Web.order 0.00 0.10 -0.03 1.00 -0.01
Gender.male 0.03 -0.04 0.02 -0.01 1.00
Address_is_res 0.02 0.22 -0.21 -0.04 -0.05
Spending 0.00 0.69 -0.26 0.12 -0.02
Address_is_res Spending
US 0.02 0.00
Freq 0.22 0.69
last_update_days_ago -0.21 -0.26
Web.order -0.04 0.12
Gender.male -0.05 -0.02
Address_is_res 1.00 -0.03
Spending -0.03 1.00
n= 2000
P
US Freq last_update_days_ago Web.order Gender.male
US 0.1392 0.0880 0.8561 0.2347
Freq 0.1392 0.0000 0.0000 0.0888
last_update_days_ago 0.0880 0.0000 0.1188 0.4675
Web.order 0.8561 0.0000 0.1188 0.7948
Gender.male 0.2347 0.0888 0.4675 0.7948
Address_is_res 0.3521 0.0000 0.0000 0.0759 0.0324
Spending 0.8764 0.0000 0.0000 0.0000 0.2826
Address_is_res Spending
US 0.3521 0.8764
Freq 0.0000 0.0000
last_update_days_ago 0.0000 0.0000
Web.order 0.0759 0.0000
Gender.male 0.0324 0.2826
Address_is_res 0.2282
Spending 0.2282
Evaluate the linear relationship between two variables by building simple linear regression model
fit1 <- lm(Spending~Freq, data = Mydata)
summary(fit1)
Call:
lm(formula = Spending ~ Freq, data = Mydata)
Residuals:
Min 1Q Median 3Q Max
-465.32 -64.33 -6.33 27.50 1343.84
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -27.500 4.288 -6.414 1.77e-10 ***
Freq 91.831 2.148 42.744 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 135 on 1998 degrees of freedom
Multiple R-squared: 0.4777, Adjusted R-squared: 0.4774
F-statistic: 1827 on 1 and 1998 DF, p-value: < 2.2e-16
confint(fit1) #Confidence interval estimates
2.5 % 97.5 %
(Intercept) -35.90855 -19.09116
Freq 87.61792 96.04454
plot(fit1)
confint(fit1) #Confidence interval estimates
2.5 % 97.5 %
(Intercept) -35.90855 -19.09116
Freq 87.61792 96.04454
#The equatiion is Spending = -27.5 + 91.831*Freq #Multiple R-squared: 0.4777 is not very high. # p-value is quite low and less than 0.05.
fit2 <- lm(Spending~last_update_days_ago, data = Mydata)
summary(fit2)
Call:
lm(formula = Spending ~ last_update_days_ago, data = Mydata)
Residuals:
Min 1Q Median 3Q Max
-192.61 -95.29 -45.15 56.68 1371.34
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 193.237411 8.628528 22.39 <2e-16 ***
last_update_days_ago -0.042046 0.003538 -11.88 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 180.6 on 1998 degrees of freedom
Multiple R-squared: 0.066, Adjusted R-squared: 0.06554
F-statistic: 141.2 on 1 and 1998 DF, p-value: < 2.2e-16
confint(fit2) #Confidence interval estimates
2.5 % 97.5 %
(Intercept) 176.31555606 210.15926689
last_update_days_ago -0.04898495 -0.03510616
plot(fit2)
#The equatiion is Spending = - 193.237411 -0.042046 *last_update_days_ago #Multiple R-squared: 0.066 is very low. # However, p-value is quite low and less than 0.05..
# Multiple Linear Regression
lm.fit=lm(Spending ~ Freq + last_update_days_ago + US + Web.order + Gender.male + Address_is_res, data = Mydata)
summary(lm.fit)
Call:
lm(formula = Spending ~ Freq + last_update_days_ago + US + Web.order +
Gender.male + Address_is_res, data = Mydata)
Residuals:
Min 1Q Median 3Q Max
-435.87 -79.10 -0.17 33.29 1327.19
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.235190 10.888305 0.389 0.69734
Freq 94.567186 2.256709 41.905 < 2e-16 ***
last_update_days_ago -0.007670 0.002761 -2.778 0.00553 **
US -7.060852 7.686109 -0.919 0.35839
Web.order 15.070681 5.941364 2.537 0.01127 *
Gender.male -1.721388 5.850799 -0.294 0.76862
Address_is_res -84.963342 7.301762 -11.636 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 130.4 on 1993 degrees of freedom
Multiple R-squared: 0.514, Adjusted R-squared: 0.5125
F-statistic: 351.3 on 6 and 1993 DF, p-value: < 2.2e-16
#The equatiion is “Spending =4.235190 + 94.56Freq - 0.0076last_update_days_ago - 7.06US + 15.07Web.order - 1.721388Gender.male - 84.96 Address_is_res” #Type of purchases resulting higher Spending are #Customers with higher Freq #Customer with Web Order # Using backward elimination we would first eliminate “Gender,male with highest Pr value.
library(car)
vif(lm.fit)
Freq last_update_days_ago US
1.182859 1.167494 1.005244
Web.order Gender.male Address_is_res
1.015042 1.003961 1.079383
Spending_pred <- predict(lm.fit)
head(Spending_pred)
1 2 3 4 5 6
88.326914 -11.720632 156.524408 83.661345 85.075916 -8.524337
Spending_resid <- residuals(lm.fit)
head(Spending_resid)
1 2 3 4 5 6
39.673086 11.720632 -29.524408 -83.661345 -85.075916 8.524337
# partition data
set.seed(1) # set seed for reproducing the partition
train.index <- sample(c(1:2000), 1600)
Mydata_train <- Mydata[train.index, ]
Mydata_test <- Mydata[-train.index, ]
Now fit the regression model with the train dateset
model_train <- lm(Spending ~ Freq + last_update_days_ago + US + Web.order + Gender.male + Address_is_res, data = Mydata_train)
model_train_summ <- summary(model_train)
model_train_summ$r.squared
[1] 0.4859717
y_test <- Mydata_test$Spending
yhat_test <- predict(model_train, newdata = Mydata_test)
n_test <- length(Mydata_test$Spending)
# test RMSE
rmse <- sqrt(sum((y_test - yhat_test)^2) / n_test)
rmse
[1] 116.5629
Try doing this analysis, starting with one predictor variables. Then sequentially add or drop variables till you find the best combination of predictor variables. Is this the best way to find the bext regression model?
hist(lm.fit$residuals)
NA
NA
##Histogram of model residual is plotted. ##The histogram is a Normal Distributio. This improves accuracy of the prediction.
lmtest::bptest(lm.fit) # Breusch-Pagan test
car::ncvTest(lm.fit)
Are there unusual observations? Interpret and discuss.
#Single variable analysis through box-plots
outlier_values <- boxplot.stats(Mydata$Spending)$out # outlier values.
outlier_values
boxplot(Mydata$Spending, main="Spending", boxwex=0.1)
mtext(paste("Outliers: ", paste(outlier_values, collapse=", ")), cex=0.6)