cc.wg <- read.csv(file.choose()) # choose the Calories_Consumed.csv data set
View(cc.wg)
# 14 Observations of 2 variables
# Scatter Diagram (Plot x,y)
plot(cc.wg$Calories.Consumed,cc.wg$Weight.gained..grams.)

# Other Exploratory data analysis and Plots
boxplot(cc.wg)

hist(cc.wg$Calories.Consumed)

hist(cc.wg$Weight.gained..grams.)

summary(cc.wg)
## Weight.gained..grams. Calories.Consumed
## Min. : 62.0 Min. :1400
## 1st Qu.: 114.5 1st Qu.:1728
## Median : 200.0 Median :2250
## Mean : 357.7 Mean :2341
## 3rd Qu.: 537.5 3rd Qu.:2775
## Max. :1100.0 Max. :3900
# Correlation coefficient value for Calories Consumes and Weight Gained
cc<- cc.wg$Calories.Consumed
wg <- cc.wg$Weight.gained..grams.
cor(wg,cc)
## [1] 0.946991
# If |r| is greater than 0.85 then Co-relation is Strong(Correlation Co-efficient = 0.946991).
# This has a Strong Co-relation
# Simple model without using any transformation
reg<-lm(wg~cc)
summary(reg)
##
## Call:
## lm(formula = wg ~ cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -158.67 -107.56 36.70 81.68 165.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -625.75236 100.82293 -6.206 4.54e-05 ***
## cc 0.42016 0.04115 10.211 2.86e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 111.6 on 12 degrees of freedom
## Multiple R-squared: 0.8968, Adjusted R-squared: 0.8882
## F-statistic: 104.3 on 1 and 12 DF, p-value: 2.856e-07
# Probability value should be less than 0.05(4.54e-05)
# The multiple-R-Squared Value is 0.8968 which is greater 0.8(In General)
# The Probability Value for F-Statistic is 2.85e-07(Overall Probability Model is also less than 0.05)
confint(reg,level = 0.95) # confidence interval
## 2.5 % 97.5 %
## (Intercept) -845.4266546 -406.0780569
## cc 0.3305064 0.5098069
# The above code will get you 2 equations
# 1 to caliculate the lower range and other for upper range
# Function to Predict the above model
predict(reg,interval="predict")
## Warning in predict.lm(reg, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 4.482599 -258.20569 267.1709
## 2 340.607908 88.93791 592.2779
## 3 802.780209 533.81393 1071.7465
## 4 298.592245 46.63271 550.5518
## 5 424.639236 172.59086 676.6876
## 6 46.498263 -213.75953 306.7561
## 7 -37.533065 -302.93258 227.8664
## 8 172.545254 -82.18110 427.2716
## 9 550.686227 295.69632 805.6761
## 10 1012.858527 724.99432 1300.7227
## 11 75.909227 -182.81852 334.6370
## 12 172.545254 -82.18110 427.2716
## 13 508.670563 254.97398 762.3671
## 14 634.717554 376.22600 893.2091
# predict(reg,type="prediction")
# R-squared value for the above model is 0.8968
# we may have to do transformation of variables for better R-squared value
# Applying transformations
# Logarthmic transformation
reg_log<-lm(wg~log(cc)) # Regression using logarthmic transformation
summary(reg_log)
##
## Call:
## lm(formula = wg ~ log(cc))
##
## Residuals:
## Min 1Q Median 3Q Max
## -187.44 -142.96 23.13 113.20 213.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6955.7 1030.9 -6.747 2.05e-05 ***
## log(cc) 948.4 133.6 7.100 1.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 152.3 on 12 degrees of freedom
## Multiple R-squared: 0.8077, Adjusted R-squared: 0.7917
## F-statistic: 50.4 on 1 and 12 DF, p-value: 1.248e-05
confint(reg_log,level=0.95)
## 2.5 % 97.5 %
## (Intercept) -9201.8063 -4709.494
## log(cc) 657.3251 1239.418
predict(reg_log,interval="predict")
## Warning in predict.lm(reg_log, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 -19.99870 -382.5178898 342.5205
## 2 385.37711 41.7849717 728.9693
## 3 756.06367 391.4700627 1120.6573
## 4 343.22032 -0.2957275 686.7364
## 5 464.45388 119.4081720 809.4996
## 6 41.20781 -315.7491910 398.1648
## 7 -85.42959 -454.8597180 284.0005
## 8 204.18573 -142.5179686 550.8894
## 9 571.93160 222.2096884 921.6535
## 10 886.18133 506.3308457 1266.0318
## 11 81.81708 -271.9519877 435.5861
## 12 204.18573 -142.5179686 550.8894
## 13 537.44155 189.5540023 885.3291
## 14 637.36248 283.3161385 991.4088
# R-squared value for the above model is 0.8077
# Adjusted R-squared: 0.7917
# we may have to do different transformation better R-squared value
# Applying different transformations
# Exponential model
reg_exp<-lm(log(wg)~cc) # regression using Exponential model
summary(reg_exp)
##
## Call:
## lm(formula = log(wg) ~ cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86537 -0.10532 0.02462 0.13467 0.42632
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.8386724 0.2994581 9.479 6.36e-07 ***
## cc 0.0011336 0.0001222 9.276 8.02e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3314 on 12 degrees of freedom
## Multiple R-squared: 0.8776, Adjusted R-squared: 0.8674
## F-statistic: 86.04 on 1 and 12 DF, p-value: 8.018e-07
confint(reg_exp,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 2.1862091856 3.491135698
## cc 0.0008673238 0.001399871
exp(predict(reg_exp,interval="predict"))
## Warning in predict.lm(reg_exp, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 93.60358 42.89897 204.2387
## 2 231.81660 109.77706 489.5279
## 3 806.66119 362.86703 1793.2251
## 4 206.97268 97.92794 437.4409
## 5 290.80881 137.55822 614.7925
## 6 104.83926 48.39645 227.1090
## 7 83.57203 37.99427 183.8247
## 8 147.30534 69.12627 313.9018
## 9 408.60351 191.59614 871.3997
## 10 1421.83342 604.68380 3343.2519
## 11 113.49743 52.63192 244.7501
## 12 147.30534 69.12627 313.9018
## 13 364.81323 171.72105 775.0284
## 14 512.58408 237.86633 1104.5802
# R-squared value has increased from 0.8776
# Adjusted R SQuare Value - 0.8674
# Higher the R-sqaured value - Better chances of getting good model
# for Calories Consumed and Weight Gained.
# Quadratic model
cc.wg[,"CC_sq"] = cc*cc
# Quadratic model
quad_mod <- lm(wg~cc+I(cc^2),data=cc.wg)
summary(quad_mod)
##
## Call:
## lm(formula = wg ~ cc + I(cc^2), data = cc.wg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.321 -63.843 -3.609 52.120 120.222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.033e+02 2.436e+02 0.834 0.42185
## cc -2.919e-01 2.021e-01 -1.444 0.17653
## I(cc^2) 1.395e-04 3.918e-05 3.561 0.00447 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79.43 on 11 degrees of freedom
## Multiple R-squared: 0.9521, Adjusted R-squared: 0.9433
## F-statistic: 109.2 on 2 and 11 DF, p-value: 5.546e-08
confint(quad_mod,level=0.95)
## 2.5 % 97.5 %
## (Intercept) -3.329553e+02 7.394722e+02
## cc -7.367129e-01 1.529250e-01
## I(cc^2) 5.328126e-05 2.257552e-04
predict(quad_mod,interval="predict")
## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 79.33345 -115.15333 273.8202
## 2 269.95364 83.77882 456.1285
## 3 823.64945 629.80153 1017.4974
## 4 236.35984 51.13324 421.5864
## 5 345.51232 157.77722 533.2474
## 6 93.39470 -95.99261 282.7820
## 7 68.06257 -133.64283 269.7680
## 8 152.32063 -31.28353 335.9248
## 9 479.77809 291.24523 668.3109
## 10 1186.94394 953.63989 1420.2480
## 11 104.89784 -82.01852 291.8142
## 12 152.32063 -31.28353 335.9248
## 13 432.23247 243.77631 620.6886
## 14 583.24042 394.65118 771.8297
# Adjusted R-Squared = 0.9433
#Multiple R -Squared Value = 0.9521
# Quadratic model
qd_model <- lm(wg~cc+CC_sq,data=cc.wg)
summary(qd_model)
##
## Call:
## lm(formula = wg ~ cc + CC_sq, data = cc.wg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.321 -63.843 -3.609 52.120 120.222
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.033e+02 2.436e+02 0.834 0.42185
## cc -2.919e-01 2.021e-01 -1.444 0.17653
## CC_sq 1.395e-04 3.918e-05 3.561 0.00447 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79.43 on 11 degrees of freedom
## Multiple R-squared: 0.9521, Adjusted R-squared: 0.9433
## F-statistic: 109.2 on 2 and 11 DF, p-value: 5.546e-08
confint(quad_mod,level=0.95)
## 2.5 % 97.5 %
## (Intercept) -3.329553e+02 7.394722e+02
## cc -7.367129e-01 1.529250e-01
## I(cc^2) 5.328126e-05 2.257552e-04
predict(quad_mod,interval="predict")
## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 79.33345 -115.15333 273.8202
## 2 269.95364 83.77882 456.1285
## 3 823.64945 629.80153 1017.4974
## 4 236.35984 51.13324 421.5864
## 5 345.51232 157.77722 533.2474
## 6 93.39470 -95.99261 282.7820
## 7 68.06257 -133.64283 269.7680
## 8 152.32063 -31.28353 335.9248
## 9 479.77809 291.24523 668.3109
## 10 1186.94394 953.63989 1420.2480
## 11 104.89784 -82.01852 291.8142
## 12 152.32063 -31.28353 335.9248
## 13 432.23247 243.77631 620.6886
## 14 583.24042 394.65118 771.8297
# Adjusted R-Squared = 0.9433
#Multiple R -Squared Value = 0.9521
# Cubic model
poly_mod <- lm(wg~cc+I(cc^2)+I(cc^3),data=cc.wg)
summary(poly_mod) # 0.9811
##
## Call:
## lm(formula = wg ~ cc + I(cc^2) + I(cc^3), data = cc.wg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -106.119 -25.806 -4.866 18.874 78.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.663e+03 6.464e+02 4.120 0.00208 **
## cc -3.451e+00 8.151e-01 -4.234 0.00173 **
## I(cc^2) 1.409e-03 3.243e-04 4.346 0.00145 **
## I(cc^3) -1.608e-07 4.093e-08 -3.928 0.00283 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.24 on 10 degrees of freedom
## Multiple R-squared: 0.9811, Adjusted R-squared: 0.9755
## F-statistic: 173.5 on 3 and 10 DF, p-value: 6.395e-09
confint(poly_mod,level=0.95)
## 2.5 % 97.5 %
## (Intercept) 1.223057e+03 4.103715e+03
## cc -5.267244e+00 -1.634872e+00
## I(cc^2) 6.868757e-04 2.132096e-03
## I(cc^3) -2.519688e-07 -6.958405e-08
predict(poly_mod,interval="predict")
## Warning in predict.lm(poly_mod, interval = "predict"): predictions on current data refer to _future_ responses
## fit lwr upr
## 1 115.52138 -15.58183 246.6246
## 2 225.96582 99.52724 352.4044
## 3 904.28877 767.36253 1041.2150
## 4 181.02235 53.76953 308.2752
## 5 332.89581 207.69983 458.0918
## 6 91.43641 -34.65979 217.5326
## 7 153.32619 10.59106 296.0613
## 8 91.85395 -35.10714 218.8150
## 9 521.42855 393.70236 649.1547
## 10 1105.44313 943.37986 1267.5064
## 11 82.22366 -42.88531 207.3326
## 12 91.85395 -35.10714 218.8150
## 13 456.11882 329.91810 582.3195
## 14 654.62121 522.69503 786.5474
# Adjusted R-Squared = 0.9755
#Multiple R -Squared Value = 0.9811
model_R_Squared_values <- list(model=NULL,R_squared=NULL)
model_R_Squared_values[["model"]] <- c("reg","reg_log","reg_exp","quad_mod","poly_mod")
model_R_Squared_values[["R_squared"]] <- c(0.8968,0.7917,0.8674,0.9433,0.9755)
Final <- cbind(model_R_Squared_values[["model"]],model_R_Squared_values[["R_squared"]])
View(model_R_Squared_values)
# Cubic model gives the best Adjusted R-Squared value
pred_final <- predict(poly_mod)
pred_final
## 1 2 3 4 5 6
## 115.52138 225.96582 904.28877 181.02235 332.89581 91.43641
## 7 8 9 10 11 12
## 153.32619 91.85395 521.42855 1105.44313 82.22366 91.85395
## 13 14
## 456.11882 654.62121
rmse<-sqrt(mean((pred_final-wg)^2))
rmse
## [1] 44.15011
plot(poly_mod)




hist(residuals(poly_mod)) # close to normal distribution
