# Go through the data and study its attributes to understand what each variable represents.
Write code to clear the environment, if needed.
# Start from a clean workspace: remove every object, including hidden
# (dot-prefixed) ones, from the current environment. The argument is spelled
# out as `all.names` instead of relying on partial matching of `all`.
rm(list = ls(all.names = TRUE))
Write code to set the working directory.
# Point R at the folder that holds the assignment data; the relative file name
# passed to read.csv() below is resolved against this directory.
# NOTE(review): this absolute path is specific to one machine — adjust it
# before running the script anywhere else.
setwd("C:/Users/C5215696/Desktop/Data Science/Regression-concepts/Multi-Regression")
# Echo the working directory to confirm the change took effect.
getwd()
## [1] "C:/Users/C5215696/Desktop/Data Science/Regression-concepts/Multi-Regression"
Think about which libraries are needed and, as a best practice, load them all here in one place.
Write code to read the CSV file into a data frame (called “data” in the assignment; named `customer_data` here).
# Load the customer data set.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned. `stringsAsFactors = TRUE` keeps the text columns as factors on
# every R version (the default changed to FALSE in R 4.0.0), matching the
# str() output shown further down.
customer_data <- read.csv(
  "CustomerData_Assignment.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
Write code to list the names of the attributes.
names(customer_data)
## [1] "CustomerID" "City"
## [3] "NoOfChildren" "MinAgeOfChild"
## [5] "MaxAgeOfChild" "Tenure"
## [7] "FrquncyOfPurchase" "NoOfUnitsPurchased"
## [9] "FrequencyOFPlay" "NoOfGamesPlayed"
## [11] "NoOfGamesBought" "FavoriteChannelOfTransaction"
## [13] "FavoriteGame" "TotalRevenueGenerated"
colnames(customer_data)
## [1] "CustomerID" "City"
## [3] "NoOfChildren" "MinAgeOfChild"
## [5] "MaxAgeOfChild" "Tenure"
## [7] "FrquncyOfPurchase" "NoOfUnitsPurchased"
## [9] "FrequencyOFPlay" "NoOfGamesPlayed"
## [11] "NoOfGamesBought" "FavoriteChannelOfTransaction"
## [13] "FavoriteGame" "TotalRevenueGenerated"
Write code to find the dimensions of the data.
dim(customer_data)
## [1] 3209 14
Write code to view the head and tail of the dataset (at least 10 rows each).
tail(customer_data,n = 10)
## CustomerID City NoOfChildren MinAgeOfChild MaxAgeOfChild Tenure
## 3200 4200 1 1 6 6 352
## 3201 4201 1 2 7 8 313
## 3202 4202 1 3 2 6 424
## 3203 4203 1 2 5 7 424
## 3204 4204 1 1 4 4 431
## 3205 4205 1 2 4 6 365
## 3206 4206 1 2 2 5 348
## 3207 4207 1 2 5 7 341
## 3208 4208 1 2 6 7 368
## 3209 4209 1 3 2 6 389
## FrquncyOfPurchase NoOfUnitsPurchased FrequencyOFPlay NoOfGamesPlayed
## 3200 27 24 2042 194
## 3201 15 15 952 113
## 3202 20 12 2038 177
## 3203 32 29 5933 382
## 3204 26 26 1166 72
## 3205 16 16 827 78
## 3206 20 20 2933 294
## 3207 16 13 1250 126
## 3208 18 14 1364 122
## 3209 25 17 475 111
## NoOfGamesBought FavoriteChannelOfTransaction FavoriteGame
## 3200 23 Favorite Uniform
## 3201 10 Favorite Uniform
## 3202 20 Uniform Uniform
## 3203 29 Favorite Uniform
## 3204 26 Favorite Uniform
## 3205 16 Favorite Uniform
## 3206 13 Favorite Uniform
## 3207 16 Uniform Uniform
## 3208 18 Favorite Uniform
## 3209 25 Favorite Uniform
## TotalRevenueGenerated
## 3200 249.96
## 3201 175.51
## 3202 117.26
## 3203 261.29
## 3204 205.00
## 3205 132.50
## 3206 223.23
## 3207 120.00
## 3208 161.50
## 3209 137.50
head(customer_data,n=10)
## CustomerID City NoOfChildren MinAgeOfChild MaxAgeOfChild Tenure
## 1 1001 1 2 3 8 210
## 2 1002 1 2 3 6 442
## 3 1003 1 4 3 5 424
## 4 1004 1 1 6 6 261
## 5 1005 1 3 6 9 422
## 6 1006 1 2 3 4 378
## 7 1007 1 3 8 12 369
## 8 1008 1 2 6 8 404
## 9 1009 1 4 6 9 420
## 10 1010 2 3 5 6 333
## FrquncyOfPurchase NoOfUnitsPurchased FrequencyOFPlay NoOfGamesPlayed
## 1 11 11 2344 108
## 2 20 20 245 22
## 3 18 18 1059 130
## 4 11 9 365 34
## 5 44 31 1066 102
## 6 16 16 228 12
## 7 25 15 75 2
## 8 13 12 1488 118
## 9 20 16 2743 163
## 10 15 15 1967 56
## NoOfGamesBought FavoriteChannelOfTransaction FavoriteGame
## 1 10 Uniform Uniform
## 2 7 Favorite Uniform
## 3 18 Favorite Uniform
## 4 11 Favorite Uniform
## 5 44 Uniform Uniform
## 6 16 Favorite Favorite
## 7 25 Favorite Favorite
## 8 13 Favorite Uniform
## 9 16 Uniform Uniform
## 10 15 Favorite Uniform
## TotalRevenueGenerated
## 1 107.51
## 2 382.40
## 3 135.01
## 4 125.00
## 5 335.05
## 6 150.00
## 7 127.50
## 8 122.50
## 9 164.96
## 10 112.62
Write code to see the data types of the attributes.
str(customer_data)
## 'data.frame': 3209 obs. of 14 variables:
## $ CustomerID : int 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
## $ City : int 1 1 1 1 1 1 1 1 1 2 ...
## $ NoOfChildren : int 2 2 4 1 3 2 3 2 4 3 ...
## $ MinAgeOfChild : int 3 3 3 6 6 3 8 6 6 5 ...
## $ MaxAgeOfChild : int 8 6 5 6 9 4 12 8 9 6 ...
## $ Tenure : int 210 442 424 261 422 378 369 404 420 333 ...
## $ FrquncyOfPurchase : int 11 20 18 11 44 16 25 13 20 15 ...
## $ NoOfUnitsPurchased : int 11 20 18 9 31 16 15 12 16 15 ...
## $ FrequencyOFPlay : int 2344 245 1059 365 1066 228 75 1488 2743 1967 ...
## $ NoOfGamesPlayed : int 108 22 130 34 102 12 2 118 163 56 ...
## $ NoOfGamesBought : int 10 7 18 11 44 16 25 13 16 15 ...
## $ FavoriteChannelOfTransaction: Factor w/ 2 levels "Favorite","Uniform": 2 1 1 1 2 1 1 1 2 1 ...
## $ FavoriteGame : Factor w/ 3 levels "Favorite","NONE",..: 3 3 3 3 3 1 1 3 3 3 ...
## $ TotalRevenueGenerated : num 108 382 135 125 335 ...
Write code to remove the variables that are not useful and store the resulting data.
# Keep a copy of the city codes, then drop the two columns that are not used
# as predictors (City and CustomerID) from the modelling data.
city_data <- customer_data[["City"]]
customer_data <- customer_data[
  setdiff(names(customer_data), c("City", "CustomerID"))
]
# Alternative that drops a single column by position:
# cust_data <- customer_data[, -which(names(customer_data) == "City")]
Write code to inspect the variables and convert them to the required types.
# Names of the categorical columns; every other column is treated as numeric.
cat_attr <- c("FavoriteGame", "FavoriteChannelOfTransaction")
num_attr <- setdiff(names(customer_data), cat_attr)
# Convert column by column with lapply(): apply() would first coerce the whole
# data frame to a matrix, which is wasteful and gives wrong results if a
# non-numeric column ever ends up in num_attr.
customer_data[num_attr] <- lapply(customer_data[num_attr], as.numeric)
Check the changes again using the str() command.
str(customer_data)
## 'data.frame': 3209 obs. of 12 variables:
## $ NoOfChildren : num 2 2 4 1 3 2 3 2 4 3 ...
## $ MinAgeOfChild : num 3 3 3 6 6 3 8 6 6 5 ...
## $ MaxAgeOfChild : num 8 6 5 6 9 4 12 8 9 6 ...
## $ Tenure : num 210 442 424 261 422 378 369 404 420 333 ...
## $ FrquncyOfPurchase : num 11 20 18 11 44 16 25 13 20 15 ...
## $ NoOfUnitsPurchased : num 11 20 18 9 31 16 15 12 16 15 ...
## $ FrequencyOFPlay : num 2344 245 1059 365 1066 ...
## $ NoOfGamesPlayed : num 108 22 130 34 102 12 2 118 163 56 ...
## $ NoOfGamesBought : num 10 7 18 11 44 16 25 13 16 15 ...
## $ FavoriteChannelOfTransaction: Factor w/ 2 levels "Favorite","Uniform": 2 1 1 1 2 1 1 1 2 1 ...
## $ FavoriteGame : Factor w/ 3 levels "Favorite","NONE",..: 3 3 3 3 3 1 1 3 3 3 ...
## $ TotalRevenueGenerated : num 108 382 135 125 335 ...
Write code to extract the numerical attributes into num_df and to separate out the categorical attributes.
# Split the columns by type: the categorical columns go to cate_df and the
# numeric columns (still including the target at this point) go to num_df.
cate_df <- customer_data[, cat_attr]
num_df <- customer_data[, num_attr, drop = FALSE]
Write code to separate the target.
# Keep the response as its own one-column data frame, then remove it from the
# numeric predictors.
target_attr <- customer_data[, "TotalRevenueGenerated", drop = FALSE]
num_df[["TotalRevenueGenerated"]] <- NULL
Write code to normalize (standardize) num_df.
library(vegan)
## Warning: package 'vegan' was built under R version 3.3.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 3.3.3
## Loading required package: lattice
## This is vegan 2.4-3
str(num_df)
## 'data.frame': 3209 obs. of 9 variables:
## $ NoOfChildren : num 2 2 4 1 3 2 3 2 4 3 ...
## $ MinAgeOfChild : num 3 3 3 6 6 3 8 6 6 5 ...
## $ MaxAgeOfChild : num 8 6 5 6 9 4 12 8 9 6 ...
## $ Tenure : num 210 442 424 261 422 378 369 404 420 333 ...
## $ FrquncyOfPurchase : num 11 20 18 11 44 16 25 13 20 15 ...
## $ NoOfUnitsPurchased: num 11 20 18 9 31 16 15 12 16 15 ...
## $ FrequencyOFPlay : num 2344 245 1059 365 1066 ...
## $ NoOfGamesPlayed : num 108 22 130 34 102 12 2 118 163 56 ...
## $ NoOfGamesBought : num 10 7 18 11 44 16 25 13 16 15 ...
# Standardize the numeric predictors with vegan::decostand().
# NOTE(review): "standardize" presumably z-scores each column (mean 0, sd 1) —
# confirm against the vegan documentation.
# NOTE(review): the scaling is computed from all rows and happens before the
# train/test split performed later in this script, so test-set statistics
# influence the training features; consider deriving the scaling from the
# training rows only.
num_df=decostand(num_df,method = "standardize")
Write code to examine the correlations between the attributes.
cor(num_df)
## NoOfChildren MinAgeOfChild MaxAgeOfChild Tenure
## NoOfChildren 1.0000000 -0.1791759583 0.189001961 0.08534060
## MinAgeOfChild -0.1791760 1.0000000000 0.367127367 -0.05582027
## MaxAgeOfChild 0.1890020 0.3671273667 1.000000000 -0.04032911
## Tenure 0.0853406 -0.0558202691 -0.040329109 1.00000000
## FrquncyOfPurchase 0.1376303 -0.0175678432 -0.006212834 0.19334370
## NoOfUnitsPurchased 0.1370396 -0.0061912434 -0.003925588 0.19241309
## FrequencyOFPlay 0.1650357 0.0066897624 0.012095286 0.24180938
## NoOfGamesPlayed 0.2133424 0.0001039156 0.027946580 0.27369597
## NoOfGamesBought 0.1313984 -0.0077484225 -0.007062492 0.18828481
## FrquncyOfPurchase NoOfUnitsPurchased FrequencyOFPlay
## NoOfChildren 0.137630281 0.137039620 0.165035716
## MinAgeOfChild -0.017567843 -0.006191243 0.006689762
## MaxAgeOfChild -0.006212834 -0.003925588 0.012095286
## Tenure 0.193343704 0.192413092 0.241809378
## FrquncyOfPurchase 1.000000000 0.934130532 0.279957433
## NoOfUnitsPurchased 0.934130532 1.000000000 0.311981695
## FrequencyOFPlay 0.279957433 0.311981695 1.000000000
## NoOfGamesPlayed 0.397566861 0.436149179 0.740204301
## NoOfGamesBought 0.947787464 0.868374420 0.286943164
## NoOfGamesPlayed NoOfGamesBought
## NoOfChildren 0.2133424389 0.131398358
## MinAgeOfChild 0.0001039156 -0.007748422
## MaxAgeOfChild 0.0279465799 -0.007062492
## Tenure 0.2736959714 0.188284813
## FrquncyOfPurchase 0.3975668611 0.947787464
## NoOfUnitsPurchased 0.4361491791 0.868374420
## FrequencyOFPlay 0.7402043013 0.286943164
## NoOfGamesPlayed 1.0000000000 0.399178320
## NoOfGamesBought 0.3991783197 1.000000000
Write code to draw a corrplot of the correlations between the attributes.
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.3.3
corrplot(cor(num_df),method = "number")
Write code to combine the numerical and categorical data with the target.
str(customer_data)
## 'data.frame': 3209 obs. of 12 variables:
## $ NoOfChildren : num 2 2 4 1 3 2 3 2 4 3 ...
## $ MinAgeOfChild : num 3 3 3 6 6 3 8 6 6 5 ...
## $ MaxAgeOfChild : num 8 6 5 6 9 4 12 8 9 6 ...
## $ Tenure : num 210 442 424 261 422 378 369 404 420 333 ...
## $ FrquncyOfPurchase : num 11 20 18 11 44 16 25 13 20 15 ...
## $ NoOfUnitsPurchased : num 11 20 18 9 31 16 15 12 16 15 ...
## $ FrequencyOFPlay : num 2344 245 1059 365 1066 ...
## $ NoOfGamesPlayed : num 108 22 130 34 102 12 2 118 163 56 ...
## $ NoOfGamesBought : num 10 7 18 11 44 16 25 13 16 15 ...
## $ FavoriteChannelOfTransaction: Factor w/ 2 levels "Favorite","Uniform": 2 1 1 1 2 1 1 1 2 1 ...
## $ FavoriteGame : Factor w/ 3 levels "Favorite","NONE",..: 3 3 3 3 3 1 1 3 3 3 ...
## $ TotalRevenueGenerated : num 108 382 135 125 335 ...
str(target_attr)
## 'data.frame': 3209 obs. of 1 variable:
## $ TotalRevenueGenerated: num 108 382 135 125 335 ...
str(cate_df)
## 'data.frame': 3209 obs. of 2 variables:
## $ FavoriteGame : Factor w/ 3 levels "Favorite","NONE",..: 3 3 3 3 3 1 1 3 3 3 ...
## $ FavoriteChannelOfTransaction: Factor w/ 2 levels "Favorite","Uniform": 2 1 1 1 2 1 1 1 2 1 ...
str(num_df)
## 'data.frame': 3209 obs. of 9 variables:
## $ NoOfChildren : num -0.124 -0.124 1.808 -1.09 0.842 ...
## $ MinAgeOfChild : num -0.528 -0.528 -0.528 0.28 0.28 ...
## $ MaxAgeOfChild : num 0.00106 -0.22662 -0.34046 -0.22662 0.11491 ...
## $ Tenure : num -1.519 1.044 0.845 -0.956 0.823 ...
## $ FrquncyOfPurchase : num -0.624 0.442 0.205 -0.624 3.285 ...
## $ NoOfUnitsPurchased: num -0.513 0.74 0.462 -0.791 2.272 ...
## $ FrequencyOFPlay : num 0.428 -0.731 -0.281 -0.665 -0.277 ...
## $ NoOfGamesPlayed : num 0.1616 -0.8054 0.409 -0.6705 0.0941 ...
## $ NoOfGamesBought : num -0.543 -0.885 0.369 -0.429 3.332 ...
## - attr(*, "decostand")= chr "standardize"
combined_data=cbind(target_attr,num_df,cate_df)
Write code to set the seed, and add a comment explaining why it is used.
set.seed(29)
# Fix the seed so that the random train/test split below selects the same rows
# on every run, which makes the results reproducible.
Write code to select the training rows using sample().
# Draw 70% of the row indices (without replacement) for the training set.
# seq_len() is safe when the data frame has zero rows (1:0 would yield c(1, 0)),
# and floor() makes the rounding of the fractional sample size explicit
# instead of relying on sample() truncating it silently.
train_rows <- sample(
  seq_len(nrow(combined_data)),
  size = floor(0.7 * nrow(combined_data))
)
Write code to build the train and test sets.
train_data=combined_data[train_rows,]
test_data=combined_data[-train_rows,]
Write code to plot each attribute against the target.
# Scatter plot of every numeric predictor against the target, on a 3 x 3 grid.
# Looping over the column names replaces nine copy-pasted plot() calls and
# labels every panel with its exact column name, so no label can be misspelled.
target_name <- "TotalRevenueGenerated"
is_numeric_col <- vapply(combined_data, is.numeric, logical(1))
predictor_names <- setdiff(names(combined_data)[is_numeric_col], target_name)

old_par <- par(mfrow = c(3, 3))
for (predictor in predictor_names) {
  plot(
    combined_data[[predictor]],
    combined_data[[target_name]],
    xlab = predictor,
    ylab = "Total Revenue"
  )
}
# Restore the previous graphics layout so later plots are not forced onto the
# 3 x 3 grid.
par(old_par)
Write code to fit a linear regression model.
# Fit a linear model of revenue on all nine numeric predictors, using the
# training rows only.
linReg_model <- lm(
  formula = TotalRevenueGenerated ~ NoOfChildren + MinAgeOfChild +
    MaxAgeOfChild + Tenure + FrquncyOfPurchase + NoOfUnitsPurchased +
    FrequencyOFPlay + NoOfGamesPlayed + NoOfGamesBought,
  data = train_data
)
Write code to plot the model diagnostics.
par(mfrow = c(2,2))
plot(linReg_model)
Write code to check the summary of the model.
summary(linReg_model)
##
## Call:
## lm(formula = TotalRevenueGenerated ~ NoOfChildren + MinAgeOfChild +
## MaxAgeOfChild + Tenure + FrquncyOfPurchase + NoOfUnitsPurchased +
## FrequencyOFPlay + NoOfGamesPlayed + NoOfGamesBought, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.47 -27.96 -4.55 22.48 318.81
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 169.7797 0.9409 180.441 < 2e-16 ***
## NoOfChildren 3.3141 1.0401 3.186 0.00146 **
## MinAgeOfChild 13.9216 2.2782 6.111 1.17e-09 ***
## MaxAgeOfChild -1.8467 1.0026 -1.842 0.06563 .
## Tenure -0.4760 0.9851 -0.483 0.62896
## FrquncyOfPurchase 79.9014 4.1560 19.226 < 2e-16 ***
## NoOfUnitsPurchased 73.5796 2.6271 28.008 < 2e-16 ***
## FrequencyOFPlay 4.3019 1.4323 3.003 0.00270 **
## NoOfGamesPlayed -4.5316 1.5318 -2.958 0.00312 **
## NoOfGamesBought -98.4499 3.0501 -32.277 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.39 on 2236 degrees of freedom
## Multiple R-squared: 0.7186, Adjusted R-squared: 0.7175
## F-statistic: 634.5 on 9 and 2236 DF, p-value: < 2.2e-16
Write code to predict the target values with the linear model.
# Predictor columns = numeric attributes minus the target. Selecting by name
# (rather than dropping whatever element happens to be last) stays correct if
# the column order ever changes.
attr_without_target <- setdiff(num_attr, "TotalRevenueGenerated")
# Predict revenue for the held-out rows with the full linear model.
pred_values <- predict(linReg_model, newdata = test_data[attr_without_target])
Write code to run stepAIC.
library(MASS)
aic_model=stepAIC(linReg_model,direction = 'both')
## Start: AIC=17047.71
## TotalRevenueGenerated ~ NoOfChildren + MinAgeOfChild + MaxAgeOfChild +
## Tenure + FrquncyOfPurchase + NoOfUnitsPurchased + FrequencyOFPlay +
## NoOfGamesPlayed + NoOfGamesBought
##
## Df Sum of Sq RSS AIC
## - Tenure 1 460 4405482 17046
## <none> 4405022 17048
## - MaxAgeOfChild 1 6683 4411705 17049
## - NoOfGamesPlayed 1 17242 4422264 17055
## - FrequencyOFPlay 1 17772 4422793 17055
## - NoOfChildren 1 20001 4425023 17056
## - MinAgeOfChild 1 73564 4478586 17083
## - FrquncyOfPurchase 1 728178 5133200 17389
## - NoOfUnitsPurchased 1 1545398 5950420 17721
## - NoOfGamesBought 1 2052433 6457455 17905
##
## Step: AIC=17045.95
## TotalRevenueGenerated ~ NoOfChildren + MinAgeOfChild + MaxAgeOfChild +
## FrquncyOfPurchase + NoOfUnitsPurchased + FrequencyOFPlay +
## NoOfGamesPlayed + NoOfGamesBought
##
## Df Sum of Sq RSS AIC
## <none> 4405482 17046
## - MaxAgeOfChild 1 6505 4411987 17047
## + Tenure 1 460 4405022 17048
## - FrequencyOFPlay 1 17434 4422916 17053
## - NoOfGamesPlayed 1 17924 4423406 17053
## - NoOfChildren 1 20047 4425529 17054
## - MinAgeOfChild 1 76063 4481545 17082
## - FrquncyOfPurchase 1 727756 5133238 17387
## - NoOfUnitsPurchased 1 1547166 5952648 17720
## - NoOfGamesBought 1 2052658 6458140 17903
Write code to predict the target values with the stepAIC model.
predict_aic_values=predict(aic_model,test_data[attr_without_target])
Write code to check for multicollinearity in the lm model.
library(car)
## Warning: package 'car' was built under R version 3.3.3
vif(linReg_model)
## NoOfChildren MinAgeOfChild MaxAgeOfChild
## 1.265423 1.201447 1.071255
## Tenure FrquncyOfPurchase NoOfUnitsPurchased
## 1.119430 19.920974 7.949876
## FrequencyOFPlay NoOfGamesPlayed NoOfGamesBought
## 2.216765 2.515500 10.589748
Write code to check the diagnostic plots of the stepAIC model.
par(mfrow=c(2,2))
plot(aic_model)
Write code to check for multicollinearity in the stepAIC model.
vif(aic_model)
## NoOfChildren MinAgeOfChild MaxAgeOfChild
## 1.265354 1.184135 1.067739
## FrquncyOfPurchase NoOfUnitsPurchased FrequencyOFPlay
## 19.900110 7.946468 2.203238
## NoOfGamesPlayed NoOfGamesBought
## 2.493808 10.589633
# Mean Absolute Error (MAE): the average magnitude of the prediction errors.
#   actual, predicted: numeric vectors of observed and predicted values.
#   Returns a single number.
mae <- function(actual, predicted) {
  abs_residuals <- abs(actual - predicted)
  mean(abs_residuals)
}
# Mean Squared Error (MSE): the average of the squared prediction errors.
#   actual, predicted: numeric vectors of observed and predicted values.
#   Returns a single number.
mse <- function(actual, predicted) {
  squared_residuals <- (actual - predicted)^2
  mean(squared_residuals)
}
# Root Mean Squared Error (RMSE): the square root of the mean squared
# prediction error.
#   actual, predicted: numeric vectors of observed and predicted values.
#   Returns a single number.
rmse <- function(actual, predicted) {
  mean_squared <- mean((actual - predicted)^2)
  sqrt(mean_squared)
}
# Mean Absolute Percentage Error (MAPE): the mean of |error / actual|,
# multiplied by 100 so the result is a percentage.
#   actual, predicted: numeric vectors of observed and predicted values.
#   Returns a single number; a zero in `actual` leads to a division by zero
#   (Inf or NaN).
mape <- function(actual, predicted) {
  relative_error <- (actual - predicted) / actual
  mean(abs(relative_error)) * 100
}
Write code to evaluate the prediction error of the stepAIC model.
mae(test_data$TotalRevenueGenerated, predict_aic_values)
## [1] 32.65343
mse(test_data$TotalRevenueGenerated, predict_aic_values)
## [1] 2181.524
rmse(test_data$TotalRevenueGenerated, predict_aic_values)
## [1] 46.70679
mape(test_data$TotalRevenueGenerated, predict_aic_values)
## [1] 19.96523
# OR: compute the same metrics in a single call with regr.eval() from DMwR.
# Note: regr.eval() reports MAPE as a fraction (0.1997 below), whereas the
# mape() function defined above returns a percentage (19.97).
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.3.3
## Loading required package: grid
regr.eval(test_data$TotalRevenueGenerated, predict_aic_values)
## mae mse rmse mape
## 32.6534295 2181.5242460 46.7067901 0.1996523
Write code to evaluate the prediction error of the linear regression model.
mae(test_data$TotalRevenueGenerated, pred_values)
## [1] 32.62928
mse(test_data$TotalRevenueGenerated, pred_values)
## [1] 2170.543
rmse(test_data$TotalRevenueGenerated, pred_values)
## [1] 46.58909
mape(test_data$TotalRevenueGenerated, pred_values)
## [1] 19.9562
# OR: the same metrics in a single call; regr.eval() reports MAPE as a fraction
# (0.1996) rather than the percentage returned by mape() (19.96).
regr.eval(test_data$TotalRevenueGenerated, pred_values)
## mae mse rmse mape
## 32.629279 2170.543222 46.589089 0.199562