load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/nc.Rdata"))
names(nc)

##  [1] "fage"           "mage"           "mature"         "weeks"         
##  [5] "premie"         "visits"         "marital"        "gained"        
##  [9] "weight"         "lowbirthweight" "gender"         "habit"         
## [13] "whitemom"

# structure(nc) gives cases
str(nc)

## 'data.frame':    1000 obs. of  13 variables:
##  $ fage          : int  NA NA 19 21 NA NA 18 17 NA 20 ...
##  $ mage          : int  13 14 15 15 15 15 15 15 16 16 ...
##  $ mature        : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
##  $ weeks         : int  39 42 37 41 39 38 37 35 38 37 ...
##  $ premie        : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
##  $ visits        : int  10 15 11 6 9 19 12 5 9 13 ...
##  $ marital       : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gained        : int  38 20 38 34 27 22 76 15 NA 52 ...
##  $ weight        : num  7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
##  $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
##  $ gender        : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
##  $ habit         : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
##  $ whitemom      : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...

summary(nc$gained)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0    20.0    30.0    30.3    38.0    85.0      27

hist(nc$gained)

plot of chunk unnamed-chunk-1

boxplot(nc$gained)

plot of chunk unnamed-chunk-1

Now get the cleaned data for the variable gained.

cleaned_gained <- na.omit(nc$gained)
n = length(cleaned_gained)

Bootstrap samples are samples with replacement.

boot_means = rep(NA, 100)

for (i in 1:100) {
    samp = sample(cleaned_gained, n, replace = TRUE)
    boot_means[i] <- mean(samp)
}
# print(boot_means)
hist(boot_means)

plot of chunk unnamed-chunk-3

Now load the inference function from the Data Camp assets

load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/inference.Rdata"))

Here is the code for the inference function: inference(nc$gained, type = “ci”, method = “simulation”, conflevel = 0.9, est = “mean”, boot_method = “perc”) I don't know what the 'boot_method = “perc” means.

inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.9, est = "mean", 
    boot_method = "perc")

## Single mean 
## Summary statistics:

## mean = 30.3258 ;  sd = 14.2413 ;  n = 973

plot of chunk unnamed-chunk-5

## Bootstrap method: Percentile
## 90 % Bootstrap interval = ( 29.5642 , 31.0763 )

Now for a 95% confidence interval

inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.95, est = "mean", 
    boot_method = "perc")

## Single mean 
## Summary statistics:

## mean = 30.3258 ;  sd = 14.2413 ;  n = 973

plot of chunk unnamed-chunk-6

## Bootstrap method: Percentile
## 95 % Bootstrap interval = ( 29.4425 , 31.2415 )

Notice how the interval gets larger. Now try it with the standard error method.

inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.9, est = "mean", 
    boot_method = "se")

## Single mean 
## Summary statistics:

## mean = 30.3258 ;  sd = 14.2413 ;  n = 973

plot of chunk unnamed-chunk-7

## Bootstrap method: Standard error; Boot. mean = 30.3455 ; Boot. SE = 0.4457 
## 90 % Bootstrap interval = ( 29.6124 , 31.0786 )

Now estimate the confidence interval around the median instead of the mean.

inference(nc$gained, type = "ci", method = "simulation", conflevel = 0.9, est = "median", 
    boot_method = "perc")

## Single median 
## Summary statistics:

## median = 30 ;  n = 973

plot of chunk unnamed-chunk-8

## Bootstrap method: Percentile
## 90 % Bootstrap interval = ( 29 , 30 )

Notice that this time it returns integers because the median itself is an integer. Father's Age: fage Use the inference function to create a 95% bootstrap CI.

inference(nc$fage, type = "ci", method = "simulation", conflevel = 0.95, est = "mean", 
    boot_method = "se")

## Single mean 
## Summary statistics:

## mean = 30.2557 ;  sd = 6.7638 ;  n = 829

plot of chunk unnamed-chunk-9

## Bootstrap method: Standard error; Boot. mean = 30.2534 ; Boot. SE = 0.2416 
## 95 % Bootstrap interval = ( 29.7798 , 30.7269 )

Relationship between two variables: categorical versus numerical.

plot(nc$habit, nc$weight)  #default for plot assigns the first variable to the x axis.

plot of chunk unnamed-chunk-10

by function–compares means of two distributions by spliting the catagorical variable by its levels and looking at the mean of the interval level variable (i.e, the numerica variable) for each category.

by(nc$weight, nc$habit, mean)

## nc$habit: nonsmoker
## [1] 7.144
## -------------------------------------------------------- 
## nc$habit: smoker
## [1] 6.829

# this takes the second variable as the x axis, acting like a 'function of'
# type procedure.

Conditions for Inference======================================

by(nc$weight, nc$habit, length)

## nc$habit: nonsmoker
## [1] 873
## -------------------------------------------------------- 
## nc$habit: smoker
## [1] 126

So there are 873 cases where the mother was a non-smoker and 126 cases where the mother smoked. More inference================================================ Arguments for Inference function 1 response variable 2 grouping variable 3 the parameter we are interested in 4 type of inference: hypothesis or CI 5 null hypothesis 6 alternative hypothesis 7 method: theoretical or simulation 8 order, which gets subtracted from which–optional

inference(y = nc$weight, x = nc$habit, est = "mean", type = "ht", null = 0, 
    alternative = "twosided", method = "theoretical")

## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_nonsmoker = 873, mean_nonsmoker = 7.144, sd_nonsmoker = 1.519
## n_smoker = 126, mean_smoker = 6.829, sd_smoker = 1.386

## Observed difference between means (nonsmoker-smoker) = 0.3155
## H0: mu_nonsmoker - mu_smoker = 0 
## HA: mu_nonsmoker - mu_smoker != 0 
## Standard error = 0.134 
## Test statistic: Z =  2.359 
## p-value =  0.0184

plot of chunk unnamed-chunk-13

Changing the Order============================================ Add the order parameter to subtract the mean of non-smokers from smokers.

inference(y = nc$weight, x = nc$habit, est = "mean", type = "ht", null = 0, 
    alternative = "twosided", method = "theoretical", order = c("smoker", "nonsmoker"))

## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_smoker = 126, mean_smoker = 6.829, sd_smoker = 1.386
## n_nonsmoker = 873, mean_nonsmoker = 7.144, sd_nonsmoker = 1.519

## Observed difference between means (smoker-nonsmoker) = -0.3155
## H0: mu_smoker - mu_nonsmoker = 0 
## HA: mu_smoker - mu_nonsmoker != 0 
## Standard error = 0.134 
## Test statistic: Z =  -2.359 
## p-value =  0.0184

plot of chunk unnamed-chunk-14

=====Question 7=====

inference(y = nc$weight, x = nc$habit, est = "mean", type = "ci", null = 0, 
    alternative = "twosided", method = "theoretical")

## Response variable: numerical, Explanatory variable: categorical
## Difference between two means
## Summary statistics:
## n_nonsmoker = 873, mean_nonsmoker = 7.144, sd_nonsmoker = 1.519
## n_smoker = 126, mean_smoker = 6.829, sd_smoker = 1.386

plot of chunk unnamed-chunk-15

## Observed difference between means (nonsmoker-smoker) = 0.3155
## Standard error = 0.1338 
## 95 % Confidence interval = ( 0.0534 , 0.5777 )

There is a 95% probability that babies born to non-smokers are between 0.05 and 0.58 pounds heavier than babies born to smokers. ======Question 8===== Inspect the levels of the variable

names(nc)

##  [1] "fage"           "mage"           "mature"         "weeks"         
##  [5] "premie"         "visits"         "marital"        "gained"        
##  [9] "weight"         "lowbirthweight" "gender"         "habit"         
## [13] "whitemom"

summary(nc$mature)

##  mature mom younger mom 
##         133         867

str(nc)

## 'data.frame':    1000 obs. of  13 variables:
##  $ fage          : int  NA NA 19 21 NA NA 18 17 NA 20 ...
##  $ mage          : int  13 14 15 15 15 15 15 15 16 16 ...
##  $ mature        : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
##  $ weeks         : int  39 42 37 41 39 38 37 35 38 37 ...
##  $ premie        : Factor w/ 2 levels "full term","premie": 1 1 1 1 1 1 1 2 1 1 ...
##  $ visits        : int  10 15 11 6 9 19 12 5 9 13 ...
##  $ marital       : Factor w/ 2 levels "married","not married": 1 1 1 1 1 1 1 1 1 1 ...
##  $ gained        : int  38 20 38 34 27 22 76 15 NA 52 ...
##  $ weight        : num  7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
##  $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
##  $ gender        : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
##  $ habit         : Factor w/ 2 levels "nonsmoker","smoker": 1 1 1 1 1 1 1 1 1 1 ...
##  $ whitemom      : Factor w/ 2 levels "not white","white": 1 1 2 2 1 1 1 1 2 2 ...

by(nc$mature, nc$age, mean)

## Error: 'names' attribute [1] must be the same length as the vector [0]

boxplot(nc$mature)

## Error: adding class "factor" to an invalid object

hist(nc$mature)

## Error: 'x' must be numeric

table(nc$mature, nc$age)

## Error: all arguments must have the same length

table(nc$age, nc$mature)

## Error: all arguments must have the same length

summary(nc$mature)

##  mature mom younger mom 
##         133         867

by(nc$mature, summary(nc$age))

## Error: arguments must have same length

by(nc$age, nc$mature, summary)

## Error: arguments must have same length

I can't figure out how to do this. I keep getting the 'must have same length' error message. =====GSS===== Load the GSS data set and inspect wordsum and class

load(url("http://s3.amazonaws.com/assets.datacamp.com/course/dasi/gss.Rdata"))
summary(gss$wordsum)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    5.00    6.00    6.14    7.00   10.00

summary(gss$class)

##   LOWER  MIDDLE   UPPER WORKING 
##      41     331      16     407

=====Analyze the Variables=====

summary(gss$wordsum)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    5.00    6.00    6.14    7.00   10.00

summary(gss$class)

##   LOWER  MIDDLE   UPPER WORKING 
##      41     331      16     407

plot(gss$wordsum)  #not informative

plot of chunk unnamed-chunk-18

hist(gss$wordsum)

plot of chunk unnamed-chunk-18

hist(gss$class)  #doesn't work

## Error: 'x' must be numeric

boxplot(gss$class)  #doesn't work

## Error: adding class "factor" to an invalid object

by(gss$wordsum, gss$class, mean)

## gss$class: LOWER
## [1] 5.073
## -------------------------------------------------------- 
## gss$class: MIDDLE
## [1] 6.761
## -------------------------------------------------------- 
## gss$class: UPPER
## [1] 6.188
## -------------------------------------------------------- 
## gss$class: WORKING
## [1] 5.749

boxplot(gss$wordsum ~ gss$class)

plot of chunk unnamed-chunk-18

=====Question 9===== We should use an ANOVA test to test the differences in a numerical variable between groups. =====ANOVA test===== F tests are always one sided, i.e., you have to specify whether something is bigger or smaller, not just different.

inference(y = gss$wordsum, x = gss$class, est = "mean", method = "theoretical", 
    type = "ht", alternative = "greater")  # this is an F test

## Response variable: numerical, Explanatory variable: categorical
## ANOVA
## Summary statistics:
## n_LOWER = 41, mean_LOWER = 5.073, sd_LOWER = 2.24
## n_MIDDLE = 331, mean_MIDDLE = 6.761, sd_MIDDLE = 1.886
## n_UPPER = 16, mean_UPPER = 6.188, sd_UPPER = 2.344
## n_WORKING = 407, mean_WORKING = 5.749, sd_WORKING = 1.865

## H_0: All means are equal.
## H_A: At least one mean is different.
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value  Pr(>F)
## x           3    237    78.9    21.7 1.6e-13
## Residuals 791   2870     3.6                
## 
## Pairwise tests: t tests with pooled SD 
##          LOWER MIDDLE  UPPER
## MIDDLE  0.0000     NA     NA
## UPPER   0.0475 0.2396     NA
## WORKING 0.0306 0.0000 0.3671

plot of chunk unnamed-chunk-19

=====Question 10===== Find the modified alpha.

inference(y = gss$wordsum, x = gss$class, est = "mean", method = "theoretical", 
    type = "ht", alternative = "greater")

## Response variable: numerical, Explanatory variable: categorical
## ANOVA
## Summary statistics:
## n_LOWER = 41, mean_LOWER = 5.073, sd_LOWER = 2.24
## n_MIDDLE = 331, mean_MIDDLE = 6.761, sd_MIDDLE = 1.886
## n_UPPER = 16, mean_UPPER = 6.188, sd_UPPER = 2.344
## n_WORKING = 407, mean_WORKING = 5.749, sd_WORKING = 1.865

## H_0: All means are equal.
## H_A: At least one mean is different.
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value  Pr(>F)
## x           3    237    78.9    21.7 1.6e-13
## Residuals 791   2870     3.6                
## 
## Pairwise tests: t tests with pooled SD 
##          LOWER MIDDLE  UPPER
## MIDDLE  0.0000     NA     NA
## UPPER   0.0475 0.2396     NA
## WORKING 0.0306 0.0000 0.3671

plot of chunk unnamed-chunk-20

=====Question 11===== Middle and Lower are different, but it also seems that Middle and Working are lower, too. Why isn't that correct?