# Read the lung-capacity data set from disk and preview its first six rows.
lung.capacity <- read.csv(
  file = "C:\\Users\\pranit\\Documents\\Imarticus\\LungCapData.csv"
)
head(lung.capacity, n = 6L)
## LungCap Age Height Smoke Gender Caesarean
## 1 6.475 6 62.1 no male no
## 2 10.125 18 74.7 yes female no
## 3 9.550 16 69.7 no female yes
## 4 11.125 14 71.0 no male no
## 5 4.800 5 56.9 no male no
## 6 6.225 11 58.7 no female no
library(ggplot2)
# Histogram of lung capacity. The bin count is stated explicitly (30, the value
# stat_bin() was falling back to) so the plot is unchanged but ggplot2 no longer
# emits its "Pick better value with `binwidth`" message.
ggplot(data = lung.capacity, aes(x = LungCap)) +
  geom_histogram(bins = 30, color = "red")
# Lung capacity by age. Age is a continuous column, so geom_boxplot() needs an
# explicit grouping to draw one box per age; without `group` ggplot2 warns
# "Continuous x aesthetic -- did you forget aes(group=...)?" and does not split
# the data by age.

# Horizontal layout: lung capacity along the horizontal axis, one box per age.
ggplot(lung.capacity, aes(x = Age, y = LungCap, group = Age)) +
  geom_boxplot() +
  coord_flip()

# Vertical layout: age along the horizontal axis, one box per age.
ggplot(lung.capacity, aes(x = Age, y = LungCap, group = Age)) +
  geom_boxplot()
# Bar chart of the number of observations per gender, followed by the share of
# each gender in the data set.
ggplot(data = lung.capacity, mapping = aes(x = Gender)) +
  geom_bar()
prop.table(table(lung.capacity[["Gender"]]))
##
## female male
## 0.4937931 0.5062069
# Distribution of lung capacity within each gender.
ggplot(data = lung.capacity, mapping = aes(x = Gender, y = LungCap)) +
  geom_boxplot()

# Compare gender against smoking status: one bar per Smoke level, placed side
# by side within each gender.
ggplot(
  data = lung.capacity,
  mapping = aes(x = Gender, fill = factor(Smoke))
) +
  geom_bar(position = position_dodge(width = 1))
library(caret)
## Loading required package: lattice
library(lattice)
# Full linear model: LungCap regressed on every other column of the data set.
# The formula is written out instead of calling lm(data = ...) with no formula,
# which silently uses whatever column happens to be first as the response and
# all remaining columns as predictors.
linear <- lm(
  LungCap ~ Age + Height + Smoke + Gender + Caesarean,
  data = lung.capacity
)
z <- summary(linear)

# In-sample predictions. The whole data frame is passed as `newdata`; predict()
# only reads the predictor columns, so there is no need to drop the response by
# position with lung.capacity[-1].
predictions <- predict(linear, newdata = lung.capacity)
RMSE(predictions, lung.capacity$LungCap)
## [1] 1.015587

# Standard lm diagnostic plots arranged on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(linear)
# Reduced model: the full model without Caesarean.
# The response is written as the bare column name so it is looked up in `data`
# and matches the response of `linear`. Writing lung.capacity$LungCap made
# anova() drop this model with "response differs from model 1", so the two
# models were never actually compared.
reducelm <- lm(LungCap ~ Age + Height + Smoke + Gender, data = lung.capacity)

# In-sample RMSE of the reduced model.
predictions <- predict(reducelm, newdata = lung.capacity)
RMSE(predictions, lung.capacity$LungCap)
## [1] 1.019516
summary(reducelm)
##
## Call:
## lm(formula = LungCap ~ Age + Height + Smoke + Gender, data = lung.capacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2915 -0.7360 0.0184 0.7125 3.0599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.33282 0.47245 -23.987 < 2e-16 ***
## Age 0.16012 0.01806 8.864 < 2e-16 ***
## Height 0.26363 0.01009 26.123 < 2e-16 ***
## Smokeyes -0.61774 0.12633 -4.890 1.24e-06 ***
## Gendermale 0.38528 0.07991 4.822 1.74e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.023 on 720 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8523
## F-statistic: 1045 on 4 and 720 DF, p-value: < 2.2e-16

# Partial F test of the reduced model against the full model (smaller model
# first, as is conventional for nested comparisons). This tests whether adding
# Caesarean significantly improves the fit.
# NOTE(review): the output previously recorded here came from a run in which
# the reduced model had been discarded, so it was only the sequential ANOVA of
# `linear`; it has been removed. Re-run to regenerate. The F statistic for this
# comparison is expected to match the Caesarean row of that sequential table
# (F = 5.5737, p = 0.0185) -- confirm after re-running.
anv <- anova(reducelm, linear, test = "F")

# Print the comparison table itself. summary() on an anova table only gives
# per-column quantiles of the table's numbers, which is not meaningful, so it
# is no longer called.
anv

# F statistic(s) of the comparison; the first entry is NA because the first
# model has nothing to be compared against. In a multi-model comparison table
# the column is named "F" (not "F value" as in a single-model table).
anv[["F"]]