cc.wg <- read.csv(file.choose()) # choose the Calories_Consumed.csv data set
View(cc.wg)

# 14 Observations of 2 variables

# Scatter Diagram (Plot x,y)
plot(cc.wg$Calories.Consumed,cc.wg$Weight.gained..grams.)

# Other Exploratory data analysis and Plots

boxplot(cc.wg)

hist(cc.wg$Calories.Consumed)

hist(cc.wg$Weight.gained..grams.)

summary(cc.wg)

##  Weight.gained..grams. Calories.Consumed
##  Min.   :  62.0        Min.   :1400     
##  1st Qu.: 114.5        1st Qu.:1728     
##  Median : 200.0        Median :2250     
##  Mean   : 357.7        Mean   :2341     
##  3rd Qu.: 537.5        3rd Qu.:2775     
##  Max.   :1100.0        Max.   :3900

# Correlation coefficient value for Calories Consumes and Weight Gained
cc<- cc.wg$Calories.Consumed
wg <- cc.wg$Weight.gained..grams.
cor(wg,cc)

## [1] 0.946991

# If |r| is greater than  0.85 then Co-relation is Strong(Correlation Co-efficient = 0.946991). 
# This has a Strong Co-relation 

# Simple model without using any transformation
reg<-lm(wg~cc)
summary(reg)

## 
## Call:
## lm(formula = wg ~ cc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -158.67 -107.56   36.70   81.68  165.53 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -625.75236  100.82293  -6.206 4.54e-05 ***
## cc             0.42016    0.04115  10.211 2.86e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 111.6 on 12 degrees of freedom
## Multiple R-squared:  0.8968, Adjusted R-squared:  0.8882 
## F-statistic: 104.3 on 1 and 12 DF,  p-value: 2.856e-07

# Probability value should be less than 0.05(4.54e-05)
# The multiple-R-Squared Value is 0.8968 which is greater 0.8(In General)
# The Probability Value for F-Statistic is 2.85e-07(Overall Probability Model is also less than 0.05)
confint(reg,level = 0.95) # confidence interval

##                    2.5 %       97.5 %
## (Intercept) -845.4266546 -406.0780569
## cc             0.3305064    0.5098069

# The above code will get you 2 equations 
# 1 to caliculate the lower range and other for upper range

# Function to Predict the above model 
predict(reg,interval="predict")

## Warning in predict.lm(reg, interval = "predict"): predictions on current data refer to _future_ responses

##            fit        lwr       upr
## 1     4.482599 -258.20569  267.1709
## 2   340.607908   88.93791  592.2779
## 3   802.780209  533.81393 1071.7465
## 4   298.592245   46.63271  550.5518
## 5   424.639236  172.59086  676.6876
## 6    46.498263 -213.75953  306.7561
## 7   -37.533065 -302.93258  227.8664
## 8   172.545254  -82.18110  427.2716
## 9   550.686227  295.69632  805.6761
## 10 1012.858527  724.99432 1300.7227
## 11   75.909227 -182.81852  334.6370
## 12  172.545254  -82.18110  427.2716
## 13  508.670563  254.97398  762.3671
## 14  634.717554  376.22600  893.2091

# predict(reg,type="prediction")
# R-squared value for the above model is 0.8968 

# we may have to do transformation of variables for better R-squared value
# Applying transformations

# Logarthmic transformation
reg_log<-lm(wg~log(cc))  # Regression using logarthmic transformation
summary(reg_log)

## 
## Call:
## lm(formula = wg ~ log(cc))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -187.44 -142.96   23.13  113.20  213.82 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6955.7     1030.9  -6.747 2.05e-05 ***
## log(cc)        948.4      133.6   7.100 1.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 152.3 on 12 degrees of freedom
## Multiple R-squared:  0.8077, Adjusted R-squared:  0.7917 
## F-statistic:  50.4 on 1 and 12 DF,  p-value: 1.248e-05

confint(reg_log,level=0.95)

##                  2.5 %    97.5 %
## (Intercept) -9201.8063 -4709.494
## log(cc)       657.3251  1239.418

predict(reg_log,interval="predict")

## Warning in predict.lm(reg_log, interval = "predict"): predictions on current data refer to _future_ responses

##          fit          lwr       upr
## 1  -19.99870 -382.5178898  342.5205
## 2  385.37711   41.7849717  728.9693
## 3  756.06367  391.4700627 1120.6573
## 4  343.22032   -0.2957275  686.7364
## 5  464.45388  119.4081720  809.4996
## 6   41.20781 -315.7491910  398.1648
## 7  -85.42959 -454.8597180  284.0005
## 8  204.18573 -142.5179686  550.8894
## 9  571.93160  222.2096884  921.6535
## 10 886.18133  506.3308457 1266.0318
## 11  81.81708 -271.9519877  435.5861
## 12 204.18573 -142.5179686  550.8894
## 13 537.44155  189.5540023  885.3291
## 14 637.36248  283.3161385  991.4088

# R-squared value for the above model is 0.8077
# Adjusted R-squared:  0.7917 

# we may have to do different transformation better R-squared value
# Applying different transformations

# Exponential model 
reg_exp<-lm(log(wg)~cc) # regression using Exponential model
summary(reg_exp)

## 
## Call:
## lm(formula = log(wg) ~ cc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86537 -0.10532  0.02462  0.13467  0.42632 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.8386724  0.2994581   9.479 6.36e-07 ***
## cc          0.0011336  0.0001222   9.276 8.02e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3314 on 12 degrees of freedom
## Multiple R-squared:  0.8776, Adjusted R-squared:  0.8674 
## F-statistic: 86.04 on 1 and 12 DF,  p-value: 8.018e-07

confint(reg_exp,level=0.95)

##                    2.5 %      97.5 %
## (Intercept) 2.1862091856 3.491135698
## cc          0.0008673238 0.001399871

exp(predict(reg_exp,interval="predict"))

## Warning in predict.lm(reg_exp, interval = "predict"): predictions on current data refer to _future_ responses

##           fit       lwr       upr
## 1    93.60358  42.89897  204.2387
## 2   231.81660 109.77706  489.5279
## 3   806.66119 362.86703 1793.2251
## 4   206.97268  97.92794  437.4409
## 5   290.80881 137.55822  614.7925
## 6   104.83926  48.39645  227.1090
## 7    83.57203  37.99427  183.8247
## 8   147.30534  69.12627  313.9018
## 9   408.60351 191.59614  871.3997
## 10 1421.83342 604.68380 3343.2519
## 11  113.49743  52.63192  244.7501
## 12  147.30534  69.12627  313.9018
## 13  364.81323 171.72105  775.0284
## 14  512.58408 237.86633 1104.5802

# R-squared value has increased from 0.8776
# Adjusted R SQuare Value - 0.8674 
# Higher the R-sqaured value - Better chances of getting good model 
# for Calories Consumed and Weight Gained.

# Quadratic model
cc.wg[,"CC_sq"] = cc*cc

# Quadratic model
quad_mod <- lm(wg~cc+I(cc^2),data=cc.wg)
summary(quad_mod)

## 
## Call:
## lm(formula = wg ~ cc + I(cc^2), data = cc.wg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.321 -63.843  -3.609  52.120 120.222 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.033e+02  2.436e+02   0.834  0.42185   
## cc          -2.919e-01  2.021e-01  -1.444  0.17653   
## I(cc^2)      1.395e-04  3.918e-05   3.561  0.00447 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.43 on 11 degrees of freedom
## Multiple R-squared:  0.9521, Adjusted R-squared:  0.9433 
## F-statistic: 109.2 on 2 and 11 DF,  p-value: 5.546e-08

confint(quad_mod,level=0.95)

##                     2.5 %       97.5 %
## (Intercept) -3.329553e+02 7.394722e+02
## cc          -7.367129e-01 1.529250e-01
## I(cc^2)      5.328126e-05 2.257552e-04

predict(quad_mod,interval="predict")

## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses

##           fit        lwr       upr
## 1    79.33345 -115.15333  273.8202
## 2   269.95364   83.77882  456.1285
## 3   823.64945  629.80153 1017.4974
## 4   236.35984   51.13324  421.5864
## 5   345.51232  157.77722  533.2474
## 6    93.39470  -95.99261  282.7820
## 7    68.06257 -133.64283  269.7680
## 8   152.32063  -31.28353  335.9248
## 9   479.77809  291.24523  668.3109
## 10 1186.94394  953.63989 1420.2480
## 11  104.89784  -82.01852  291.8142
## 12  152.32063  -31.28353  335.9248
## 13  432.23247  243.77631  620.6886
## 14  583.24042  394.65118  771.8297

# Adjusted R-Squared = 0.9433
#Multiple R -Squared Value = 0.9521

# Quadratic model
qd_model <- lm(wg~cc+CC_sq,data=cc.wg)
summary(qd_model)

## 
## Call:
## lm(formula = wg ~ cc + CC_sq, data = cc.wg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -90.321 -63.843  -3.609  52.120 120.222 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.033e+02  2.436e+02   0.834  0.42185   
## cc          -2.919e-01  2.021e-01  -1.444  0.17653   
## CC_sq        1.395e-04  3.918e-05   3.561  0.00447 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 79.43 on 11 degrees of freedom
## Multiple R-squared:  0.9521, Adjusted R-squared:  0.9433 
## F-statistic: 109.2 on 2 and 11 DF,  p-value: 5.546e-08

confint(quad_mod,level=0.95)

##                     2.5 %       97.5 %
## (Intercept) -3.329553e+02 7.394722e+02
## cc          -7.367129e-01 1.529250e-01
## I(cc^2)      5.328126e-05 2.257552e-04

predict(quad_mod,interval="predict")

## Warning in predict.lm(quad_mod, interval = "predict"): predictions on current data refer to _future_ responses

##           fit        lwr       upr
## 1    79.33345 -115.15333  273.8202
## 2   269.95364   83.77882  456.1285
## 3   823.64945  629.80153 1017.4974
## 4   236.35984   51.13324  421.5864
## 5   345.51232  157.77722  533.2474
## 6    93.39470  -95.99261  282.7820
## 7    68.06257 -133.64283  269.7680
## 8   152.32063  -31.28353  335.9248
## 9   479.77809  291.24523  668.3109
## 10 1186.94394  953.63989 1420.2480
## 11  104.89784  -82.01852  291.8142
## 12  152.32063  -31.28353  335.9248
## 13  432.23247  243.77631  620.6886
## 14  583.24042  394.65118  771.8297

# Adjusted R-Squared = 0.9433
#Multiple R -Squared Value = 0.9521

# Cubic model
poly_mod <- lm(wg~cc+I(cc^2)+I(cc^3),data=cc.wg)
summary(poly_mod) # 0.9811

## 
## Call:
## lm(formula = wg ~ cc + I(cc^2) + I(cc^3), data = cc.wg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -106.119  -25.806   -4.866   18.874   78.571 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.663e+03  6.464e+02   4.120  0.00208 **
## cc          -3.451e+00  8.151e-01  -4.234  0.00173 **
## I(cc^2)      1.409e-03  3.243e-04   4.346  0.00145 **
## I(cc^3)     -1.608e-07  4.093e-08  -3.928  0.00283 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52.24 on 10 degrees of freedom
## Multiple R-squared:  0.9811, Adjusted R-squared:  0.9755 
## F-statistic: 173.5 on 3 and 10 DF,  p-value: 6.395e-09

confint(poly_mod,level=0.95)

##                     2.5 %        97.5 %
## (Intercept)  1.223057e+03  4.103715e+03
## cc          -5.267244e+00 -1.634872e+00
## I(cc^2)      6.868757e-04  2.132096e-03
## I(cc^3)     -2.519688e-07 -6.958405e-08

predict(poly_mod,interval="predict")

## Warning in predict.lm(poly_mod, interval = "predict"): predictions on current data refer to _future_ responses

##           fit       lwr       upr
## 1   115.52138 -15.58183  246.6246
## 2   225.96582  99.52724  352.4044
## 3   904.28877 767.36253 1041.2150
## 4   181.02235  53.76953  308.2752
## 5   332.89581 207.69983  458.0918
## 6    91.43641 -34.65979  217.5326
## 7   153.32619  10.59106  296.0613
## 8    91.85395 -35.10714  218.8150
## 9   521.42855 393.70236  649.1547
## 10 1105.44313 943.37986 1267.5064
## 11   82.22366 -42.88531  207.3326
## 12   91.85395 -35.10714  218.8150
## 13  456.11882 329.91810  582.3195
## 14  654.62121 522.69503  786.5474

# Adjusted R-Squared = 0.9755
#Multiple R -Squared Value = 0.9811

model_R_Squared_values <- list(model=NULL,R_squared=NULL)
model_R_Squared_values[["model"]] <- c("reg","reg_log","reg_exp","quad_mod","poly_mod")
model_R_Squared_values[["R_squared"]] <- c(0.8968,0.7917,0.8674,0.9433,0.9755)
Final <- cbind(model_R_Squared_values[["model"]],model_R_Squared_values[["R_squared"]])
View(model_R_Squared_values)

# Cubic  model gives the best Adjusted R-Squared value
pred_final <- predict(poly_mod)
pred_final

##          1          2          3          4          5          6 
##  115.52138  225.96582  904.28877  181.02235  332.89581   91.43641 
##          7          8          9         10         11         12 
##  153.32619   91.85395  521.42855 1105.44313   82.22366   91.85395 
##         13         14 
##  456.11882  654.62121

rmse<-sqrt(mean((pred_final-wg)^2))
rmse

## [1] 44.15011

plot(poly_mod)

hist(residuals(poly_mod)) # close to normal distribution