library(data.table)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
Prediction of steps
# Read the pedometer data set into a data.table and preview the first rows.
# NOTE(review): the path is absolute and machine-specific; the script will
# only run where this exact file exists.
ped <- fread(input = "C:/Users/Gurpreet/Documents/IS605/ped.csv")
head(ped, n = 6L)
## steps moderate min kcal mile rain day daytype
## 1: 7537 1918 17 117 4.2 shine T Weekday
## 2: 10008 5573 48 274 5.6 rain W Weekday
## 3: 6912 3309 29 166 3.9 rain R Weekday
## 4: 7552 3826 33 186 4.3 rain F Weekday
## 5: 6789 2306 20 135 3.8 rain W Weekday
## 6: 4219 1941 17 90 2.3 rain M Weekday
# Normalise the column names to lower case so later code can refer to them
# consistently. setnames() renames by reference, which is the data.table
# idiom; `names(ped) <- ...` would copy the table instead.
setnames(ped, tolower(names(ped)))
# Inspect column types after renaming.
str(ped)
## Classes 'data.table' and 'data.frame': 68 obs. of 8 variables:
## $ steps : int 7537 10008 6912 7552 6789 4219 9260 5519 4465 8057 ...
## $ moderate: int 1918 5573 3309 3826 2306 1941 6095 3183 1603 5242 ...
## $ min : int 17 48 29 33 20 17 54 27 14 47 ...
## $ kcal : int 117 274 166 186 135 90 280 154 82 237 ...
## $ mile : num 4.2 5.6 3.9 4.3 3.8 2.3 5.2 3.1 2.5 4.5 ...
## $ rain : chr "shine" "rain" "rain" "rain" ...
## $ day : chr "T" "W" "R" "F" ...
## $ daytype : chr "Weekday" "Weekday" "Weekday" "Weekday" ...
## - attr(*, ".internal.selfref")=<externalptr>
Data Dictionary : Information about the dataset and columns included can be found here
Steps : Total number of steps for the day
Moderate : Number of steps at a moderate walking speed
Min : Number of minutes walking at a moderate speed
kcal : Number of calories burned walking at a moderate speed
Mile : Total number of miles walked
Rain : Type of weather (rain or shine)
Day : Day of the week (U=Sunday, M=Monday, T=Tuesday, W=Wednesday, R=Thursday, F=Friday, S=Saturday)
DayType : Coded as Weekday or Weekend
# Add 0/1 indicator columns for the two categorical predictors:
#   daytype_d is 1 when daytype is "Weekday", otherwise 0
#   rain_d    is 1 when rain is "rain", otherwise 0
ped <- transform(
  ped,
  daytype_d = as.numeric(daytype == "Weekday"),
  rain_d = as.numeric(rain == "rain")
)
head(ped)
## steps moderate min kcal mile rain day daytype daytype_d rain_d
## 1: 7537 1918 17 117 4.2 shine T Weekday 1 0
## 2: 10008 5573 48 274 5.6 rain W Weekday 1 1
## 3: 6912 3309 29 166 3.9 rain R Weekday 1 1
## 4: 7552 3826 33 186 4.3 rain F Weekday 1 1
## 5: 6789 2306 20 135 3.8 rain W Weekday 1 1
## 6: 4219 1941 17 90 2.3 rain M Weekday 1 1
# Keep only the numeric columns for the scatterplot matrix. The character
# columns are dropped by name rather than by position (previously 6:8), so
# the selection stays correct if the column order of the input ever changes.
ped1 <- ped[, !c("rain", "day", "daytype"), with = FALSE]
# Pairwise scatterplots of every remaining variable.
pairs(ped1, gap = 0.5)
# Full model: regress total daily steps on every activity measure plus the
# weekday and rain indicator columns.
model <- lm(
  formula = steps ~ moderate + min + kcal + mile + daytype_d + rain_d,
  data = ped
)
summary(model)
##
## Call:
## lm(formula = steps ~ moderate + min + kcal + mile + daytype_d +
## rain_d, data = ped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -184.27 -95.63 -24.60 39.02 1082.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 225.8648 124.6880 1.811 0.0750 .
## moderate -1.1331 0.6263 -1.809 0.0753 .
## min 64.2252 51.3733 1.250 0.2160
## kcal 15.4327 6.3223 2.441 0.0176 *
## mile 1568.3661 99.1428 15.819 <2e-16 ***
## daytype_d -42.0069 67.2927 -0.624 0.5348
## rain_d 20.8195 55.5691 0.375 0.7092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 184.9 on 61 degrees of freedom
## Multiple R-squared: 0.9921, Adjusted R-squared: 0.9913
## F-statistic: 1280 on 6 and 61 DF, p-value: < 2.2e-16
# Backward elimination, step 1: remove rain_d, the term with the largest
# p-value in the full model, and refit.
model <- update(model, formula. = . ~ . - rain_d, data = ped)
summary(model)
##
## Call:
## lm(formula = steps ~ moderate + min + kcal + mile + daytype_d,
## data = ped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -186.79 -95.43 -20.98 31.68 1072.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220.4153 122.9752 1.792 0.0780 .
## moderate -1.0569 0.5881 -1.797 0.0772 .
## min 60.1507 49.8597 1.206 0.2322
## kcal 14.3988 5.6487 2.549 0.0133 *
## mile 1581.7814 91.8078 17.229 <2e-16 ***
## daytype_d -41.5353 66.8129 -0.622 0.5364
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 183.6 on 62 degrees of freedom
## Multiple R-squared: 0.9921, Adjusted R-squared: 0.9915
## F-statistic: 1557 on 5 and 62 DF, p-value: < 2.2e-16
# Backward elimination, step 2: remove daytype_d, now the term with the
# largest p-value, and refit.
model <- update(model, formula. = . ~ . - daytype_d, data = ped)
summary(model)
##
## Call:
## lm(formula = steps ~ moderate + min + kcal + mile, data = ped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -193.96 -92.98 -24.12 34.53 1075.52
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 188.1056 110.9088 1.696 0.0948 .
## moderate -1.0040 0.5791 -1.734 0.0879 .
## min 53.3324 48.4011 1.102 0.2747
## kcal 14.4432 5.6207 2.570 0.0126 *
## mile 1583.6265 91.3119 17.343 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 182.7 on 63 degrees of freedom
## Multiple R-squared: 0.9921, Adjusted R-squared: 0.9915
## F-statistic: 1966 on 4 and 63 DF, p-value: < 2.2e-16
# Backward elimination, step 3: remove min, the remaining term with the
# largest p-value, and refit. This leaves moderate, kcal and mile.
model <- update(model, formula. = . ~ . - min, data = ped)
summary(model)
##
## Call:
## lm(formula = steps ~ moderate + kcal + mile, data = ped)
##
## Residuals:
## Min 1Q Median 3Q Max
## -192.40 -94.81 -25.63 36.85 1084.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143.0853 103.2811 1.385 0.1707
## moderate -0.3937 0.1695 -2.323 0.0234 *
## kcal 10.4113 4.2736 2.436 0.0176 *
## mile 1642.3936 74.2416 22.122 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 183 on 64 degrees of freedom
## Multiple R-squared: 0.9919, Adjusted R-squared: 0.9915
## F-statistic: 2612 on 3 and 64 DF, p-value: < 2.2e-16
# Predicted step counts from the final fitted model. predict() uses the
# model's full-precision coefficients, so the values stay in sync with
# `model` instead of relying on rounded numbers copied from the printed
# summary (results may differ from hand-entered coefficients in the last
# digits). unname() keeps steps_pred a plain numeric vector.
steps_pred <- unname(predict(model, newdata = ped))
# Store the predictions next to the observed steps for comparison.
ped$steps_p <- steps_pred
head(ped)
## steps moderate min kcal mile rain day daytype daytype_d rain_d
## 1: 7537 1918 17 117 4.2 shine T Weekday 1 0
## 2: 10008 5573 48 274 5.6 rain W Weekday 1 1
## 3: 6912 3309 29 166 3.9 rain R Weekday 1 1
## 4: 7552 3826 33 186 4.3 rain F Weekday 1 1
## 5: 6789 2306 20 135 3.8 rain W Weekday 1 1
## 6: 4219 1941 17 90 2.3 rain M Weekday 1 1
## steps_p
## 1: 7504.144
## 2: 9999.096
## 3: 6973.943
## 4: 7635.583
## 5: 6881.834
## 6: 4093.436
# Residuals versus fitted values for the final model, with a reference line
# at zero; expand_limits() forces the y axis to extend down to -800.
ggplot(data = model, mapping = aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  expand_limits(y = c(0, -800))

# Normal Q-Q plot of the residuals with its reference line.
qqnorm(residuals(model))
qqline(residuals(model))
The residual plot shows the points clustering around the line y = 0, and there does not seem to be a strict pattern. The residual plot alone does not help in deciding how well the model fits. A closer look at the Q-Q plot suggests the residuals are approximately normal, with only a few outliers at the end. Considering the F-statistic and p-value from the model, the possibility that the model does not fit the data is low. Considering these latter factors, I can assume that the model is appropriate.