# Load the LearnBayes package and the `studentdata` data set
# (presumably shipped with LearnBayes -- confirm).
library(LearnBayes)
data(studentdata)
# NOTE(review): attach() places the data frame's columns on the search path so
# they can be referenced by bare name (e.g. `Dvds`, `Height`). This is fragile:
# attached columns are copies and can be masked by other objects. Prefer
# `studentdata$col` or `data = studentdata` in new code.
attach(studentdata)
# Preview the first rows to check the available columns and their values.
head(studentdata)
## Student Height Gender Shoes Number Dvds ToSleep WakeUp Haircut Job Drink
## 1 1 67 female 10 5 10 -2.5 5.5 60 30.0 water
## 2 2 64 female 20 7 5 1.5 8.0 0 20.0 pop
## 3 3 61 female 12 2 6 -1.5 7.5 48 0.0 milk
## 4 4 61 female 3 6 40 2.0 8.5 10 0.0 water
## 5 5 70 male 4 5 6 0.0 9.0 15 17.5 pop
## 6 6 63 female NA 3 5 1.0 8.5 25 0.0 water
# Density-scaled histogram of the number of DVDs owned.
# `probability = TRUE` spells the argument out in full (no partial matching)
# and avoids `T`, which is an ordinary variable that can be reassigned.
hist(studentdata$Dvds, probability = TRUE)

# Per-column overview: quartiles and means for numeric columns, level counts
# for factors, and the number of missing values in each column.
summary(studentdata)
## Student Height Gender Shoes Number
## Min. : 1 Min. :54.0 female:435 Min. : 0.00 Min. : 1.00
## 1st Qu.:165 1st Qu.:64.0 male :222 1st Qu.: 6.00 1st Qu.: 4.00
## Median :329 Median :66.0 Median : 12.00 Median : 6.00
## Mean :329 Mean :66.7 Mean : 15.42 Mean : 5.67
## 3rd Qu.:493 3rd Qu.:70.0 3rd Qu.: 20.00 3rd Qu.: 7.00
## Max. :657 Max. :84.0 Max. :164.00 Max. :10.00
## NA's :10 NA's :22 NA's :2
## Dvds ToSleep WakeUp Haircut
## Min. : 0.00 Min. :-2.500 Min. : 1.000 Min. : 0.00
## 1st Qu.: 10.00 1st Qu.: 0.000 1st Qu.: 7.500 1st Qu.: 10.00
## Median : 20.00 Median : 1.000 Median : 8.500 Median : 16.00
## Mean : 30.93 Mean : 1.001 Mean : 8.383 Mean : 25.91
## 3rd Qu.: 30.00 3rd Qu.: 2.000 3rd Qu.: 9.000 3rd Qu.: 30.00
## Max. :1000.00 Max. : 6.000 Max. :13.000 Max. :180.00
## NA's :16 NA's :3 NA's :2 NA's :20
## Job Drink
## Min. : 0.00 milk :113
## 1st Qu.: 0.00 pop :178
## Median :10.50 water:355
## Mean :11.45 NA's : 11
## 3rd Qu.:17.50
## Max. :80.00
## NA's :32
# Bar chart of how many students reported each distinct DVD count.
# The column is referenced through the data frame explicitly so the call does
# not depend on `studentdata` having been attach()ed.
dvd_counts <- table(studentdata$Dvds)
barplot(dvd_counts, col = "red")

# The popular values of 10 and 20 could perhaps indicate that there are 10 popular movies that most people enjoy.
# Someone who owns more than 10 might own the 15 (indicated by the small frequency spike) or 20 movies that are far more popular, resulting in people owning only those.
#
# Most likely, though, it indicates that most people don't know the exact number of DVDs they own, so they round their guess to the nearest 5 or 10.
# This makes the most sense, as 10-20 DVDs is what most people would have; beyond that there are spikes at every multiple of 5.
# Side-by-side boxplots of height by gender.
# boxplot() both draws the figure and returns the statistics behind it, so a
# single call is enough; the columns are looked up in `studentdata` explicitly
# rather than through attach().
output <- boxplot(Height ~ Gender, data = studentdata)

# Show the returned list: per-group box statistics ($stats), group sizes ($n),
# notch limits ($conf), outlying values ($out) with their group index ($group),
# and the group labels ($names).
print(output)
## $stats
## [,1] [,2]
## [1,] 57.75 65
## [2,] 63.00 69
## [3,] 64.50 71
## [4,] 67.00 72
## [5,] 73.00 76
##
## $n
## [1] 428 219
##
## $conf
## [,1] [,2]
## [1,] 64.19451 70.6797
## [2,] 64.80549 71.3203
##
## $out
## [1] 56 76 55 56 76 54 54 84 78 77 56 63 77 79 62 62 61 79 59 61 78 62
##
## $group
## [1] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
##
## $names
## [1] "female" "male"
# Mean height per gender; the formula interface of aggregate() drops rows with
# a missing Height or Gender before averaging.
group_means <- aggregate(Height ~ Gender, data = studentdata, FUN = mean)
print(group_means)
## Gender Height
## 1 female 64.75701
## 2 male 70.50767
# Look the two means up by gender label and column name instead of by row and
# column position, so the sign of the difference cannot silently flip if the
# row or column order of `group_means` ever changes.
male_mean <- group_means$Height[group_means$Gender == "male"]
female_mean <- group_means$Height[group_means$Gender == "female"]
mean_diff <- male_mean - female_mean
print(mean_diff)
## [1] 5.750657
# On average male students are 5.750657 inches taller than female students
# Scatterplot of wake-up time against bedtime. The formula interface with
# `data = studentdata` keeps the axis labels as the column names and does not
# depend on `studentdata` having been attach()ed.
plot(WakeUp ~ ToSleep, data = studentdata)
# Simple linear regression of wake-up time on bedtime; lm() drops the rows
# that have a missing value in either variable.
fit <- lm(WakeUp ~ ToSleep, data = studentdata)
summary(fit)
##
## Call:
## lm(formula = WakeUp ~ ToSleep, data = studentdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4010 -0.9628 -0.0998 0.8249 4.6125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.96276 0.06180 128.85 <2e-16 ***
## ToSleep 0.42472 0.03595 11.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.282 on 651 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.1765, Adjusted R-squared: 0.1753
## F-statistic: 139.5 on 1 and 651 DF, p-value: < 2.2e-16
# Overlay the fitted regression line on the scatterplot drawn above.
abline(fit, col = "blue", lwd = 2)
