load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/nc.Rdata"))
names(nc)
## [1] "fage" "mage" "mature" "weeks"
## [5] "premie" "visits" "marital" "gained"
## [9] "weight" "lowbirthweight" "gender" "habit"
## [13] "whitemom"
# structure(nc) gives cases
str(nc)
## 'data.frame': 1000 obs. of 13 variables:
## $ fage : int NA NA 19 21 NA NA 18 17 NA 20 ...
## $ mage : int 13 14 15 15 15 15 15 15 16 16 ...
## $ mature : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
## $ weeks : int 39 42 37 41 39 38 37 35 38 37 ...
## $ premie : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
## $ visits : int 10 15 11 6 9 19 12 5 9 13 ...
## $ marital : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
## $ gained : int 38 20 38 34 27 22 76 15 NA 52 ...
## $ weight : num 7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
## $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
## $ habit : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
## $ whitemom : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...
summary(nc$gained)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 20.0 30.0 30.3 38.0 85.0 27
hist(nc$gained)
boxplot(nc$gained)
Now get the cleaned data for the variable gained.
cleaned_gained <- na.omit(nc$gained)
n = length(cleaned_gained)
Bootstrap samples are samples with replacement.
boot_means = rep(NA, 100)
for (i in 1:100) {
samp = sample(cleaned_gained, n, replace = TRUE)
boot_means[i] <- mean(samp)
}
# print(boot_means)
hist(boot_means)
Now load the inference function from the Data Camp assets
load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/inference.Rdata"))
Here is the code for the inference function: inference(nc$gained, type = “ci”, method = “simulation”, conflevel = 0.9, est = “mean”, boot_method = “perc”) I don't know what the 'boot_method = “perc” means.
inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.9, est = "mean",
boot_method = "perc")
## Single mean
## Summary statistics:
## mean = 30.3258 ; sd = 14.2413 ; n = 973
## Bootstrap method: Percentile
## 90 % Bootstrap interval = ( 29.5642 , 31.0763 )
Now for a 95% confidence interval
inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.95, est = "mean",
boot_method = "perc")
## Single mean
## Summary statistics:
## mean = 30.3258 ; sd = 14.2413 ; n = 973
## Bootstrap method: Percentile
## 95 % Bootstrap interval = ( 29.4425 , 31.2415 )
Notice how the interval gets larger. Now try it with the standard error method.
inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.9, est = "mean",
boot_method = "se")
## Single mean
## Summary statistics:
## mean = 30.3258 ; sd = 14.2413 ; n = 973
## Bootstrap method: Standard error; Boot. mean = 30.3455 ; Boot. SE = 0.4457
## 90 % Bootstrap interval = ( 29.6124 , 31.0786 )
Now estimate the confidence interval around the median instead of the mean.
inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.9, est = "median",
boot_method = "perc")
## Single median
## Summary statistics:
## median = 30 ; n = 973
## Bootstrap method: Percentile
## 90 % Bootstrap interval = ( 29 , 30 )
Notice that this time it returns integers because the median itself is an integer. Father's Age: fage Use the inference function to create a 95% bootstrap CI.
inference(nc$fage, type = "ci", method = "simulation", conflevel = 0.95, est = "mean",
boot_method = "se")
## Single mean
## Summary statistics:
## mean = 30.2557 ; sd = 6.7638 ; n = 829
## Bootstrap method: Standard error; Boot. mean = 30.2534 ; Boot. SE = 0.2416
## 95 % Bootstrap interval = ( 29.7798 , 30.7269 )
Relationship between two variables: categorical versus numerical.
plot(nc$habit, nc$weight) #default for plot assigns the first variable to the x axis.
by function–compares means of two distributions by spliting the catagorical variable by its levels and looking at the mean of the interval level variable (i.e, the numerica variable) for each category.
by(nc$weight, nc$habit, mean)
## nc$habit: nonsmoker
## [1] 7.144
## --------------------------------------------------------
## nc$habit: smoker
## [1] 6.829
# this takes the second variable as the x axis, acting like a 'function of'
# type procedure.
Conditions for Inference======================================
by(nc$weight, nc$habit, length)
## nc$habit: nonsmoker
## [1] 873
## --------------------------------------------------------
## nc$habit: smoker
## [1] 126
So there are 873 cases where the mother was a non-smoker and 126 cases where the mother smoked. More inference================================================ Arguments for Inference function 1 response variable 2 grouping variable 3 the parameter we are interested in 4 type of inference: hypothesis or CI 5 null hypothesis 6 alternative hypothesis 7 method: theoretical or simulation 8 order, which gets subtracted from which–optional
inference(y = nc$weight, x = nc$habit, est = "mean", type = "ht", null = 0,
alternative = "twosided", method = "theoretical")
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_nonsmoker = 873, mean_nonsmoker = 7.144, sd_nonsmoker = 1.519
## n_smoker = 126, mean_smoker = 6.829, sd_smoker = 1.386
## Observed difference between means (nonsmoker-smoker) = 0.3155
## H0: mu_nonsmoker - mu_smoker = 0
## HA: mu_nonsmoker - mu_smoker != 0
## Standard error = 0.134
## Test statistic: Z = 2.359
## p-value = 0.0184
Changing the Order============================================ Add the order parameter to subtract the mean of non-smokers from smokers.
inference(y = nc$weight, x = nc$habit, est = "mean", type = "ht", null = 0,
alternative = "twosided", method = "theoretical", order = c("smoker", "nonsmoker"))
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_smoker = 126, mean_smoker = 6.829, sd_smoker = 1.386
## n_nonsmoker = 873, mean_nonsmoker = 7.144, sd_nonsmoker = 1.519
## Observed difference between means (smoker-nonsmoker) = -0.3155
## H0: mu_smoker - mu_nonsmoker = 0
## HA: mu_smoker - mu_nonsmoker != 0
## Standard error = 0.134
## Test statistic: Z = -2.359
## p-value = 0.0184
=====Question 7=====
inference(y = nc$weight, x = nc$habit, est = "mean", type = "ci", null = 0,
alternative = "twosided", method = "theoretical")
## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_nonsmoker = 873, mean_nonsmoker = 7.144, sd_nonsmoker = 1.519
## n_smoker = 126, mean_smoker = 6.829, sd_smoker = 1.386
## Observed difference between means (nonsmoker-smoker) = 0.3155
## Standard error = 0.1338
## 95 % Confidence interval = ( 0.0534 , 0.5777 )
There is a 95% probability that babies born to non-smokers are between 0.05 and 0.58 pounds heavier than babies born to smokers. ======Question 8===== Inspect the levels of the variable
names(nc)
## [1] "fage" "mage" "mature" "weeks"
## [5] "premie" "visits" "marital" "gained"
## [9] "weight" "lowbirthweight" "gender" "habit"
## [13] "whitemom"
summary(nc$mature)
## mature mom younger mom
## 133 867
str(nc)
## 'data.frame': 1000 obs. of 13 variables:
## $ fage : int NA NA 19 21 NA NA 18 17 NA 20 ...
## $ mage : int 13 14 15 15 15 15 15 15 16 16 ...
## $ mature : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
## $ weeks : int 39 42 37 41 39 38 37 35 38 37 ...
## $ premie : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
## $ visits : int 10 15 11 6 9 19 12 5 9 13 ...
## $ marital : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
## $ gained : int 38 20 38 34 27 22 76 15 NA 52 ...
## $ weight : num 7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
## $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
## $ habit : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
## $ whitemom : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...
by(nc$mature, nc$age, mean)
## Error: 'names' attribute [1] must be the same length as the vector [0]
boxplot(nc$mature)
## Error: adding class "factor" to an invalid object
hist(nc$mature)
## Error: 'x' must be numeric
table(nc$mature, nc$age)
## Error: all arguments must have the same length
table(nc$age, nc$mature)
## Error: all arguments must have the same length
summary(nc$mature)
## mature mom younger mom
## 133 867
by(nc$mature, summary(nc$age))
## Error: arguments must have same length
by(nc$age, nc$mature, summary)
## Error: arguments must have same length
I can't figure out how to do this. I keep getting the 'must have same length' error message. =====GSS===== Load the GSS data set and inspect wordsum and class
load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/gss.Rdata"))
summary(gss$wordsum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 5.00 6.00 6.14 7.00 10.00
summary(gss$class)
## LOWER MIDDLE UPPER WORKING
## 41 331 16 407
=====Analyze the Variables=====
summary(gss$wordsum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 5.00 6.00 6.14 7.00 10.00
summary(gss$class)
## LOWER MIDDLE UPPER WORKING
## 41 331 16 407
plot(gss$wordsum) #not informative
hist(gss$wordsum)
hist(gss$class) #doesn't work
## Error: 'x' must be numeric
boxplot(gss$class) #doesn't work
## Error: adding class "factor" to an invalid object
by(gss$wordsum, gss$class, mean)
## gss$class: LOWER
## [1] 5.073
## --------------------------------------------------------
## gss$class: MIDDLE
## [1] 6.761
## --------------------------------------------------------
## gss$class: UPPER
## [1] 6.188
## --------------------------------------------------------
## gss$class: WORKING
## [1] 5.749
boxplot(gss$wordsum ~ gss$class)
=====Question 9===== We should use an ANOVA test to test the differences in a numerical variable between groups. =====ANOVA test===== F tests are always one sided, i.e., you have to specify whether something is bigger or smaller, not just different.
inference(y = gss$wordsum, x = gss$class, est = "mean", method = "theoretical",
type = "ht", alternative = "greater") # this is an F test
## Response variable: numerical, Explanatory variable: categorical
## ANOVA
## Summary statistics:
## n_LOWER = 41, mean_LOWER = 5.073, sd_LOWER = 2.24
## n_MIDDLE = 331, mean_MIDDLE = 6.761, sd_MIDDLE = 1.886
## n_UPPER = 16, mean_UPPER = 6.188, sd_UPPER = 2.344
## n_WORKING = 407, mean_WORKING = 5.749, sd_WORKING = 1.865
## H_0: All means are equal.
## H_A: At least one mean is different.
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 3 237 78.9 21.7 1.6e-13
## Residuals 791 2870 3.6
##
## Pairwise tests: t tests with pooled SD
## LOWER MIDDLE UPPER
## MIDDLE 0.0000 NA NA
## UPPER 0.0475 0.2396 NA
## WORKING 0.0306 0.0000 0.3671
=====Question 10===== Find the modified alpha.
inference(y = gss$wordsum, x = gss$class, est = "mean", method = "theoretical",
type = "ht", alternative = "greater")
## Response variable: numerical, Explanatory variable: categorical
## ANOVA
## Summary statistics:
## n_LOWER = 41, mean_LOWER = 5.073, sd_LOWER = 2.24
## n_MIDDLE = 331, mean_MIDDLE = 6.761, sd_MIDDLE = 1.886
## n_UPPER = 16, mean_UPPER = 6.188, sd_UPPER = 2.344
## n_WORKING = 407, mean_WORKING = 5.749, sd_WORKING = 1.865
## H_0: All means are equal.
## H_A: At least one mean is different.
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 3 237 78.9 21.7 1.6e-13
## Residuals 791 2870 3.6
##
## Pairwise tests: t tests with pooled SD
## LOWER MIDDLE UPPER
## MIDDLE 0.0000 NA NA
## UPPER 0.0475 0.2396 NA
## WORKING 0.0306 0.0000 0.3671
=====Question 11===== Middle and Lower are different, but it also seems that Middle and Working are lower, too. Why isn't that correct?