Task 2 Final Exam

I first loaded the required packages and created a data frame from the original data set.

Task 2.1
I created a new data frame that omitted the variables I did not need by using the select function, and then computed the correlation matrix for the six selected variables.

# Build the analysis data frame: keep only the six variables that go into
# the correlation analysis, then print it.
states2 <- select(
  states,
  Population, Income, Illiteracy, LifeExp, Murder, HSGrad
)
states2

## Source: local data frame [50 x 6]
## 
##    Population Income Illiteracy LifeExp Murder HSGrad
## 1        3615   3624        2.1   69.05   15.1   41.3
## 2         365   6315        1.5   69.31   11.3   66.7
## 3        2212   4530        1.8   70.55    7.8   58.1
## 4        2110   3378        1.9   70.66   10.1   39.9
## 5       21198   5114        1.1   71.71   10.3   62.6
## 6        2541   4884        0.7   72.06    6.8   63.9
## 7        3100   5348        1.1   72.48    3.1   56.0
## 8         579   4809        0.9   70.06    6.2   54.6
## 9        8277   4815        1.3   70.66   10.7   52.6
## 10       4931   4091        2.0   68.54   13.9   40.6
## ..        ...    ...        ...     ...    ...    ...

# Minimum, quartiles, median, mean and maximum for each selected variable.
summary(states2)

##    Population        Income       Illiteracy       LifeExp     
##  Min.   :  365   Min.   :3098   Min.   :0.500   Min.   :67.96  
##  1st Qu.: 1080   1st Qu.:3993   1st Qu.:0.625   1st Qu.:70.12  
##  Median : 2838   Median :4519   Median :0.950   Median :70.67  
##  Mean   : 4246   Mean   :4436   Mean   :1.170   Mean   :70.88  
##  3rd Qu.: 4968   3rd Qu.:4814   3rd Qu.:1.575   3rd Qu.:71.89  
##  Max.   :21198   Max.   :6315   Max.   :2.800   Max.   :73.60  
##      Murder           HSGrad     
##  Min.   : 1.400   Min.   :37.80  
##  1st Qu.: 4.350   1st Qu.:48.05  
##  Median : 6.850   Median :53.25  
##  Mean   : 7.378   Mean   :53.11  
##  3rd Qu.:10.675   3rd Qu.:59.15  
##  Max.   :15.100   Max.   :67.30

# Correlation matrix for the six variables; the printed result holds the
# coefficients ($r), their p-values ($p) and a symbol-coded summary ($sym).
rquery.cormat(states2)

## $r
##            LifeExp Income HSGrad Population Illiteracy Murder
## LifeExp          1                                           
## Income        0.34      1                                    
## HSGrad        0.58   0.62      1                             
## Population  -0.068   0.21 -0.098          1                  
## Illiteracy   -0.59  -0.44  -0.66       0.11          1       
## Murder       -0.78  -0.23  -0.49       0.34        0.7      1
## 
## $p
##            LifeExp  Income  HSGrad Population Illiteracy Murder
## LifeExp          0                                             
## Income       0.016       0                                     
## HSGrad     9.2e-06 1.6e-06       0                             
## Population    0.64    0.15     0.5          0                  
## Illiteracy   7e-06  0.0015 2.2e-07       0.46          0       
## Murder     2.3e-11    0.11 0.00032      0.015    1.3e-08      0
## 
## $sym
##            LifeExp Income HSGrad Population Illiteracy Murder
## LifeExp    1                                                 
## Income     .       1                                         
## HSGrad     .       ,      1                                  
## Population                       1                           
## Illiteracy .       .      ,                 1                
## Murder     ,              .      .          ,          1     
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1

Task 2.2ai

# Scatterplot of income against high school graduation rate, overlaid with a
# fitted linear-model line and its standard-error band.
states2 %>%
  ggvis(x = ~HSGrad, y = ~Income) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", se = TRUE) %>%
  add_axis(
    "x",
    title = "Percentage of High School Graduates Per State",
    title_offset = 50
  ) %>%
  add_axis("y", title = "Income Per Capita per State", title_offset = 50)

## Guessing formula = Income ~ HSGrad

High School Graduation Rates and Income Per Capita

Task 2.2aii

# Scatterplot of income against illiteracy rate, overlaid with a fitted
# linear-model ("lm") line and its standard-error band.
states2 %>%
  ggvis(x = ~Illiteracy, y = ~Income) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", se = TRUE) %>%
  add_axis("x", title = "Illiteracy Rate Per State", title_offset = 50) %>%
  add_axis("y", title = "Income Per Capita per State", title_offset = 50)

## Guessing formula = Income ~ Illiteracy

Illiteracy Rates and Income Per Capita

Task 2.2b
For this scatterplot, I split the high school graduation rate into two groups: states below the mean of 53.11 (coded 1) and states at or above the mean (coded 0).

# Narrow the data to the three variables plotted in this task and print it.
states3 <- select(states2, Illiteracy, Murder, HSGrad)
states3

## Source: local data frame [50 x 3]
## 
##    Illiteracy Murder HSGrad
## 1         2.1   15.1   41.3
## 2         1.5   11.3   66.7
## 3         1.8    7.8   58.1
## 4         1.9   10.1   39.9
## 5         1.1   10.3   62.6
## 6         0.7    6.8   63.9
## 7         1.1    3.1   56.0
## 8         0.9    6.2   54.6
## 9         1.3   10.7   52.6
## 10        2.0   13.9   40.6
## ..        ...    ...    ...

# Summary statistics before HSGrad is recoded; the HSGrad mean shown here
# (53.11) is the cut-off used in the recode that follows.
summary(states3)

##    Illiteracy        Murder           HSGrad     
##  Min.   :0.500   Min.   : 1.400   Min.   :37.80  
##  1st Qu.:0.625   1st Qu.: 4.350   1st Qu.:48.05  
##  Median :0.950   Median : 6.850   Median :53.25  
##  Mean   :1.170   Mean   : 7.378   Mean   :53.11  
##  3rd Qu.:1.575   3rd Qu.:10.675   3rd Qu.:59.15  
##  Max.   :2.800   Max.   :15.100   Max.   :67.30

# Dichotomise HSGrad at 53.11: states below the cut-off become 1, states at
# or above it become 0. The comparison yields TRUE/FALSE, which as.numeric()
# turns into 1/0.
states3$HSGrad <- as.numeric(states3$HSGrad < 53.11)
summary(states3)

##    Illiteracy        Murder           HSGrad    
##  Min.   :0.500   Min.   : 1.400   Min.   :0.00  
##  1st Qu.:0.625   1st Qu.: 4.350   1st Qu.:0.00  
##  Median :0.950   Median : 6.850   Median :0.00  
##  Mean   :1.170   Mean   : 7.378   Mean   :0.48  
##  3rd Qu.:1.575   3rd Qu.:10.675   3rd Qu.:1.00  
##  Max.   :2.800   Max.   :15.100   Max.   :1.00

# Scatterplot of murder rate against illiteracy, with the point fill set by
# the recoded HSGrad group.
# NOTE(review): group_by() is the last step, after the only layer has been
# added -- confirm it is still needed.
states3 %>%
  ggvis(~Illiteracy, ~Murder, fill = ~factor(HSGrad)) %>%
  layer_points() %>%
  group_by(HSGrad)

Task 2.3a
Hypothesis (null) There is no difference in income between states below the median high school graduation of all states and those above the median high school graduation of all states.

Hypothesis (alternative) There is a difference in income between states below the median high school graduation of all states and those above the median high school graduation of all states.

To complete this test, a new dataframe was created in which two groups were created for HSGrad, those below the median High School Graduation rate of all states and those above the median High School Graduation rate of all states.

# Start the t-test data frame from the two variables of interest and print it.
IncomeGrad <- select(states, Income, HSGrad)
IncomeGrad

## Source: local data frame [50 x 2]
## 
##    Income HSGrad
## 1    3624   41.3
## 2    6315   66.7
## 3    4530   58.1
## 4    3378   39.9
## 5    5114   62.6
## 6    4884   63.9
## 7    5348   56.0
## 8    4809   54.6
## 9    4815   52.6
## 10   4091   40.6
## ..    ...    ...

# Grouping indicator for the t-test: 0 for states at or below a graduation
# rate of 53.25 (the median), 1 for states above it.
IncomeGrad$HSGrad1 <- as.numeric(IncomeGrad$HSGrad > 53.25)
IncomeGrad$HSGrad1

##  [1] 0 1 1 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0
## [36] 0 1 0 0 0 1 0 0 1 1 0 1 0 1 1

# Check the structure of the data frame, including the new HSGrad1 column.
glimpse(IncomeGrad)

## Observations: 50
## Variables:
## $ Income  (int) 3624, 6315, 4530, 3378, 5114, 4884, 5348, 4809, 4815, ...
## $ HSGrad  (dbl) 41.3, 66.7, 58.1, 39.9, 62.6, 63.9, 56.0, 54.6, 52.6, ...
## $ HSGrad1 (dbl) 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, ...

# NOTE(review): summarize() is called without any summary expressions, so it
# returns an empty data frame; summary(IncomeGrad) was probably intended.
summarize(IncomeGrad)

## data frame with 0 columns and 0 rows

# Check the split: a mean of 0.5 on the 0/1 indicator means the states are
# divided evenly between the two groups.
summary(IncomeGrad$HSGrad1)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.5     0.5     1.0     1.0

A t-test will be used to analyze the data at an alpha level of .05

# Pooled-variance (var.equal = TRUE) two-sample t-test comparing income
# between the two graduation-rate groups.
t.test(
  Income ~ HSGrad1,
  data = IncomeGrad,
  var.equal = TRUE
)

## 
##  Two Sample t-test
## 
## data:  Income by HSGrad1
## t = -1.9642, df = 48, p-value = 0.05531
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -671.518476    7.838476
## sample estimates:
## mean in group 0 mean in group 1 
##         4269.88         4601.72

I fail to reject the null hypothesis at the specified .05 level, t(48) = -1.96, p = .055, 95% CI [-671.52, 7.84].

Task 2.3b
Hypothesis (null) There is no difference in murder rates between one group of states (Alabama, Alaska, Arkansas, Georgia, Illinois, Kentucky, Louisiana, Mississippi, and Michigan) and a second group of states (Arizona, Connecticut, Iowa, Kansas, Maine, Minnesota, Nebraska, New Hampshire, North Dakota)

Hypothesis (alternative) There is a difference in murder rates between one group of states (Alabama, Alaska, Arkansas, Georgia, Illinois, Kentucky, Louisiana, Mississippi, and Michigan) and a second group of states (Arizona, Connecticut, Iowa, Kansas, Maine, Minnesota, Nebraska, New Hampshire, North Dakota)

To complete this test, the state names, which are stored as a factor, had to be recoded into two groups. Below is the code used to recode the data.

# Keep the murder rate together with the state name used for grouping, then
# print the result.
MurderStates <- select(states, Murder, stateNames)
MurderStates

## Source: local data frame [50 x 2]
## 
##    Murder  stateNames
## 1    15.1     Alabama
## 2    11.3      Alaska
## 3     7.8     Arizona
## 4    10.1    Arkansas
## 5    10.3  California
## 6     6.8    Colorado
## 7     3.1 Connecticut
## 8     6.2    Delaware
## 9    10.7     Florida
## 10   13.9     Georgia
## ..    ...         ...

# Check the column types; stateNames is reported as a factor.
glimpse(MurderStates)

## Observations: 50
## Variables:
## $ Murder     (dbl) 15.1, 11.3, 7.8, 10.1, 10.3, 6.8, 3.1, 6.2, 10.7, 1...
## $ stateNames (fctr) Alabama, Alaska, Arizona, Arkansas, California, Co...

# NOTE(review): summarise() is called without any summary expressions, so it
# returns an empty data frame; summary(MurderStates) was probably intended.
summarise(MurderStates)

## data frame with 0 columns and 0 rows

# Recode the state names into the two comparison groups named in the
# hypotheses: scode is 0 for the first group, 1 for the second, and NA for
# every state that belongs to neither group.
#
# Arkansas is listed in the first group in the hypotheses but was missing
# from the original recode, so it is included here.
# NOTE(review): t-test output produced before this fix was based on 17 states
# (df = 15); it must be regenerated with Arkansas included.
first_group <- c(
  "Alabama", "Alaska", "Arkansas", "Georgia", "Illinois",
  "Kentucky", "Louisiana", "Mississippi", "Michigan"
)
second_group <- c(
  "Arizona", "Connecticut", "Iowa", "Kansas", "Maine",
  "Minnesota", "Nebraska", "New Hampshire", "North Dakota"
)

# Start from NA so states outside both groups stay uncoded.
MurderStates$scode <- NA_real_
MurderStates$scode[MurderStates$stateNames %in% first_group] <- 0
MurderStates$scode[MurderStates$stateNames %in% second_group] <- 1

# The grouping variable is stored as a factor with levels "0" and "1".
MurderStates$scode <- factor(MurderStates$scode)

A t-test will be used to analyze the data at an alpha level of .05

# Pooled-variance (var.equal = TRUE) two-sample t-test comparing murder rates
# between the two groups of states coded in scode.
t.test(
  Murder ~ scode,
  data = MurderStates,
  var.equal = TRUE
)

## 
##  Two Sample t-test
## 
## data:  Murder by scode
## t = 10.18, df = 15, p-value = 3.952e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   7.023301 10.743366
## sample estimates:
## mean in group 0 mean in group 1 
##       12.250000        3.366667

The null hypothesis is rejected at the specified .05 level, t(15) = 10.18, p < .05, 95% CI [7.02, 10.74] (group means: 12.25 vs. 3.37).

Task 2 Final Exam

Meg Handley

November 17, 2015