library(data.table)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2

Pedometer Data

Prediction of steps

# Read the pedometer data set into a data.table.
# NOTE(review): the path is absolute and machine-specific; the script will
# only run as-is on a machine that has the file at this exact location.
ped <- fread(input = "C:/Users/Gurpreet/Documents/IS605/ped.csv")
# Preview the first six rows.
head(ped, n = 6L)
##    steps moderate min kcal mile  rain day daytype
## 1:  7537     1918  17  117  4.2 shine   T Weekday
## 2: 10008     5573  48  274  5.6  rain   W Weekday
## 3:  6912     3309  29  166  3.9  rain   R Weekday
## 4:  7552     3826  33  186  4.3  rain   F Weekday
## 5:  6789     2306  20  135  3.8  rain   W Weekday
## 6:  4219     1941  17   90  2.3  rain   M Weekday
# Normalise every column name to lower case so later code can refer to
# columns without worrying about the capitalisation used in the CSV header.
colnames(ped) <- tolower(colnames(ped))
# Show the column types and a few values of each column.
str(ped)
## Classes 'data.table' and 'data.frame':   68 obs. of  8 variables:
##  $ steps   : int  7537 10008 6912 7552 6789 4219 9260 5519 4465 8057 ...
##  $ moderate: int  1918 5573 3309 3826 2306 1941 6095 3183 1603 5242 ...
##  $ min     : int  17 48 29 33 20 17 54 27 14 47 ...
##  $ kcal    : int  117 274 166 186 135 90 280 154 82 237 ...
##  $ mile    : num  4.2 5.6 3.9 4.3 3.8 2.3 5.2 3.1 2.5 4.5 ...
##  $ rain    : chr  "shine" "rain" "rain" "rain" ...
##  $ day     : chr  "T" "W" "R" "F" ...
##  $ daytype : chr  "Weekday" "Weekday" "Weekday" "Weekday" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Data Dictionary : Information about the dataset and columns included can be found here

Steps : Total number of steps for the day

Moderate : Number of steps at a moderate walking speed

Min : Number of minutes walking at a moderate speed

kcal : Number of calories burned walking at a moderate speed

Mile : Total number of miles walked

Rain : Type of weather (rain or shine)

Day : Day of the week (U=Sunday, M=Monday, T=Tuesday, W=Wednesday, R=Thursday, F=Friday, S=Saturday)

DayType : Coded as Weekday or Weekend

# Encode the two categorical predictors as 0/1 indicator columns:
#   daytype_d = 1 for "Weekday", 0 otherwise
#   rain_d    = 1 for "rain",    0 otherwise
ped <- transform(
  ped,
  daytype_d = as.numeric(daytype == "Weekday"),
  rain_d = as.numeric(rain == "rain")
)

head(ped)
##    steps moderate min kcal mile  rain day daytype daytype_d rain_d
## 1:  7537     1918  17  117  4.2 shine   T Weekday         1      0
## 2: 10008     5573  48  274  5.6  rain   W Weekday         1      1
## 3:  6912     3309  29  166  3.9  rain   R Weekday         1      1
## 4:  7552     3826  33  186  4.3  rain   F Weekday         1      1
## 5:  6789     2306  20  135  3.8  rain   W Weekday         1      1
## 6:  4219     1941  17   90  2.3  rain   M Weekday         1      1
# Keep only the numeric columns for the scatterplot matrix. The character
# columns are dropped by name rather than by position (6:8) so the selection
# stays correct if the column order of `ped` ever changes.
ped1 <- ped[, !c("rain", "day", "daytype"), with = FALSE]

# Pairwise scatterplots of all remaining variables.
pairs(ped1, gap = 0.5)

# Full model: regress total steps on every candidate predictor, including
# the two indicator variables.
model <- lm(
  formula = steps ~ moderate + min + kcal + mile + daytype_d + rain_d,
  data = ped
)
summary(model)
## 
## Call:
## lm(formula = steps ~ moderate + min + kcal + mile + daytype_d + 
##     rain_d, data = ped)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -184.27  -95.63  -24.60   39.02 1082.14 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  225.8648   124.6880   1.811   0.0750 .  
## moderate      -1.1331     0.6263  -1.809   0.0753 .  
## min           64.2252    51.3733   1.250   0.2160    
## kcal          15.4327     6.3223   2.441   0.0176 *  
## mile        1568.3661    99.1428  15.819   <2e-16 ***
## daytype_d    -42.0069    67.2927  -0.624   0.5348    
## rain_d        20.8195    55.5691   0.375   0.7092    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 184.9 on 61 degrees of freedom
## Multiple R-squared:  0.9921, Adjusted R-squared:  0.9913 
## F-statistic:  1280 on 6 and 61 DF,  p-value: < 2.2e-16
Backward Elimination
# Elimination step 1: rain_d has the largest p-value (0.7092) in the full
# model, so refit without it.
model <- update(model, formula. = . ~ . - rain_d, data = ped)
summary(model)
## 
## Call:
## lm(formula = steps ~ moderate + min + kcal + mile + daytype_d, 
##     data = ped)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -186.79  -95.43  -20.98   31.68 1072.31 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  220.4153   122.9752   1.792   0.0780 .  
## moderate      -1.0569     0.5881  -1.797   0.0772 .  
## min           60.1507    49.8597   1.206   0.2322    
## kcal          14.3988     5.6487   2.549   0.0133 *  
## mile        1581.7814    91.8078  17.229   <2e-16 ***
## daytype_d    -41.5353    66.8129  -0.622   0.5364    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 183.6 on 62 degrees of freedom
## Multiple R-squared:  0.9921, Adjusted R-squared:  0.9915 
## F-statistic:  1557 on 5 and 62 DF,  p-value: < 2.2e-16
# Elimination step 2: daytype_d is now the least significant term
# (p = 0.5364), so refit without it.
model <- update(model, formula. = . ~ . - daytype_d, data = ped)
summary(model)
## 
## Call:
## lm(formula = steps ~ moderate + min + kcal + mile, data = ped)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -193.96  -92.98  -24.12   34.53 1075.52 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  188.1056   110.9088   1.696   0.0948 .  
## moderate      -1.0040     0.5791  -1.734   0.0879 .  
## min           53.3324    48.4011   1.102   0.2747    
## kcal          14.4432     5.6207   2.570   0.0126 *  
## mile        1583.6265    91.3119  17.343   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 182.7 on 63 degrees of freedom
## Multiple R-squared:  0.9921, Adjusted R-squared:  0.9915 
## F-statistic:  1966 on 4 and 63 DF,  p-value: < 2.2e-16
# Elimination step 3: min is the least significant remaining predictor
# (p = 0.2747), so refit without it.
model <- update(model, formula. = . ~ . - min, data = ped)
summary(model)
## 
## Call:
## lm(formula = steps ~ moderate + kcal + mile, data = ped)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -192.40  -94.81  -25.63   36.85 1084.49 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  143.0853   103.2811   1.385   0.1707    
## moderate      -0.3937     0.1695  -2.323   0.0234 *  
## kcal          10.4113     4.2736   2.436   0.0176 *  
## mile        1642.3936    74.2416  22.122   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 183 on 64 degrees of freedom
## Multiple R-squared:  0.9919, Adjusted R-squared:  0.9915 
## F-statistic:  2612 on 3 and 64 DF,  p-value: < 2.2e-16
# Predicted steps from the final model (steps ~ moderate + kcal + mile).
# predict() uses the full-precision fitted coefficients instead of values
# hand-copied from the printed summary, so the predictions carry no rounding
# error and stay in sync if the model is refitted.
# unname() drops the row names predict() attaches, leaving a plain numeric
# vector to store as a column.
# NOTE: the printed steps_p values shown below were produced with coefficients
# rounded to four decimals; re-running may change their trailing digits.
steps_pred <- unname(predict(model, newdata = ped))
ped$steps_p <- steps_pred


head(ped)
##    steps moderate min kcal mile  rain day daytype daytype_d rain_d
## 1:  7537     1918  17  117  4.2 shine   T Weekday         1      0
## 2: 10008     5573  48  274  5.6  rain   W Weekday         1      1
## 3:  6912     3309  29  166  3.9  rain   R Weekday         1      1
## 4:  7552     3826  33  186  4.3  rain   F Weekday         1      1
## 5:  6789     2306  20  135  3.8  rain   W Weekday         1      1
## 6:  4219     1941  17   90  2.3  rain   M Weekday         1      1
##     steps_p
## 1: 7504.144
## 2: 9999.096
## 3: 6973.943
## 4: 7635.583
## 5: 6881.834
## 6: 4093.436
# Residuals-vs-fitted plot for the final model, with a horizontal reference
# line at zero; the y-axis is widened so that it reaches down to -800.
ggplot(model, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  expand_limits(y = c(0, -800))

# Normal Q-Q plot of the residuals, with its reference line.
qqnorm(resid(model))
qqline(resid(model))

Conclusion:

The residual plot shows the points clustering around the line y = 0, and there does not seem to be a strict pattern. The residual plot alone does not help in deciding the fit of the model. A closer look at the Q-Q plot suggests that the residuals are approximately normal, with only a few outliers in the tail. Considering the F-statistic and p-value from the model, the possibility that the model does not fit the data is low. Considering these latter factors, I can assume that the model is appropriate.