I first loaded the required packages and created a data frame from the original data.
Task 2.1
I created a new data frame that omitted the variables I did not need by using the select function, and then ran the correlation matrix function on the six selected variables.
# Keep only the six variables used in the correlation analysis, then print.
states2 <- select(
  states,
  Population, Income, Illiteracy, LifeExp, Murder, HSGrad
)
states2
## Source: local data frame [50 x 6]
##
## Population Income Illiteracy LifeExp Murder HSGrad
## 1 3615 3624 2.1 69.05 15.1 41.3
## 2 365 6315 1.5 69.31 11.3 66.7
## 3 2212 4530 1.8 70.55 7.8 58.1
## 4 2110 3378 1.9 70.66 10.1 39.9
## 5 21198 5114 1.1 71.71 10.3 62.6
## 6 2541 4884 0.7 72.06 6.8 63.9
## 7 3100 5348 1.1 72.48 3.1 56.0
## 8 579 4809 0.9 70.06 6.2 54.6
## 9 8277 4815 1.3 70.66 10.7 52.6
## 10 4931 4091 2.0 68.54 13.9 40.6
## .. ... ... ... ... ... ...
# Per-column summary statistics for the six selected variables.
summary(states2)
## Population Income Illiteracy LifeExp
## Min. : 365 Min. :3098 Min. :0.500 Min. :67.96
## 1st Qu.: 1080 1st Qu.:3993 1st Qu.:0.625 1st Qu.:70.12
## Median : 2838 Median :4519 Median :0.950 Median :70.67
## Mean : 4246 Mean :4436 Mean :1.170 Mean :70.88
## 3rd Qu.: 4968 3rd Qu.:4814 3rd Qu.:1.575 3rd Qu.:71.89
## Max. :21198 Max. :6315 Max. :2.800 Max. :73.60
## Murder HSGrad
## Min. : 1.400 Min. :37.80
## 1st Qu.: 4.350 1st Qu.:48.05
## Median : 6.850 Median :53.25
## Mean : 7.378 Mean :53.11
## 3rd Qu.:10.675 3rd Qu.:59.15
## Max. :15.100 Max. :67.30
# Correlation matrix of the selected variables; the printed result contains
# the coefficients ($r), their p-values ($p) and a symbol-coded view ($sym).
rquery.cormat(states2)
## $r
## LifeExp Income HSGrad Population Illiteracy Murder
## LifeExp 1
## Income 0.34 1
## HSGrad 0.58 0.62 1
## Population -0.068 0.21 -0.098 1
## Illiteracy -0.59 -0.44 -0.66 0.11 1
## Murder -0.78 -0.23 -0.49 0.34 0.7 1
##
## $p
## LifeExp Income HSGrad Population Illiteracy Murder
## LifeExp 0
## Income 0.016 0
## HSGrad 9.2e-06 1.6e-06 0
## Population 0.64 0.15 0.5 0
## Illiteracy 7e-06 0.0015 2.2e-07 0.46 0
## Murder 2.3e-11 0.11 0.00032 0.015 1.3e-08 0
##
## $sym
## LifeExp Income HSGrad Population Illiteracy Murder
## LifeExp 1
## Income . 1
## HSGrad . , 1
## Population 1
## Illiteracy . . , 1
## Murder , . . , 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
Task 2.2ai
# Scatterplot of income against the high school graduation rate, overlaid
# with a linear-model ("lm") fit and its standard-error band (se = TRUE).
states2 %>%
  ggvis(x = ~HSGrad, y = ~Income) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", se = TRUE) %>%
  add_axis(
    "x",
    title = "Percentage of High School Graduates Per State",
    title_offset = 50
  ) %>%
  add_axis("y", title = "Income Per Capita per State", title_offset = 50)
## Guessing formula = Income ~ HSGrad
# Scatterplot of income against the illiteracy rate, overlaid with a
# linear-model ("lm") fit and its standard-error band (se = TRUE).
states2 %>%
  ggvis(x = ~Illiteracy, y = ~Income) %>%
  layer_points() %>%
  layer_model_predictions(model = "lm", se = TRUE) %>%
  add_axis("x", title = "Illiteracy Rate Per State", title_offset = 50) %>%
  add_axis("y", title = "Income Per Capita per State", title_offset = 50)
## Guessing formula = Income ~ Illiteracy
Task 2.2b
For this scatterplot, I split the high school graduation rate into two groups: states below the mean of 53.11 (coded 1) and states at or above the mean (coded 0).
# Narrow the data to the three variables needed for the grouped scatterplot.
states3 <- select(states2, Illiteracy, Murder, HSGrad)
states3
## Source: local data frame [50 x 3]
##
## Illiteracy Murder HSGrad
## 1 2.1 15.1 41.3
## 2 1.5 11.3 66.7
## 3 1.8 7.8 58.1
## 4 1.9 10.1 39.9
## 5 1.1 10.3 62.6
## 6 0.7 6.8 63.9
## 7 1.1 3.1 56.0
## 8 0.9 6.2 54.6
## 9 1.3 10.7 52.6
## 10 2.0 13.9 40.6
## .. ... ... ...
# Summary statistics for the three-variable subset, before recoding HSGrad.
summary(states3)
## Illiteracy Murder HSGrad
## Min. :0.500 Min. : 1.400 Min. :37.80
## 1st Qu.:0.625 1st Qu.: 4.350 1st Qu.:48.05
## Median :0.950 Median : 6.850 Median :53.25
## Mean :1.170 Mean : 7.378 Mean :53.11
## 3rd Qu.:1.575 3rd Qu.:10.675 3rd Qu.:59.15
## Max. :2.800 Max. :15.100 Max. :67.30
# Recode HSGrad in place as a 0/1 indicator: 1 for rates below 53.11,
# 0 for rates at or above it. Missing values stay missing.
states3$HSGrad <- ifelse(states3$HSGrad < 53.11, 1, 0)
summary(states3)
## Illiteracy Murder HSGrad
## Min. :0.500 Min. : 1.400 Min. :0.00
## 1st Qu.:0.625 1st Qu.: 4.350 1st Qu.:0.00
## Median :0.950 Median : 6.850 Median :0.00
## Mean :1.170 Mean : 7.378 Mean :0.48
## 3rd Qu.:1.575 3rd Qu.:10.675 3rd Qu.:1.00
## Max. :2.800 Max. :15.100 Max. :1.00
# Murder rate against illiteracy rate, with points filled by the recoded
# HSGrad indicator.
states3 %>%
  ggvis(~Illiteracy, ~Murder, fill = ~factor(HSGrad)) %>%
  layer_points() %>%
  group_by(HSGrad)
Task 2.3a
Hypothesis (null) There is no difference in income between states below the median high school graduation of all states and those above the median high school graduation of all states.
Hypothesis (alternative) There is a difference in income between states below the median high school graduation of all states and those above the median high school graduation of all states.
To complete this test, a new dataframe was created in which two groups were created for HSGrad, those below the median High School Graduation rate of all states and those above the median High School Graduation rate of all states.
# Keep only the outcome (Income) and the variable used to form the groups.
IncomeGrad <- select(states, Income, HSGrad)
IncomeGrad
## Source: local data frame [50 x 2]
##
## Income HSGrad
## 1 3624 41.3
## 2 6315 66.7
## 3 4530 58.1
## 4 3378 39.9
## 5 5114 62.6
## 6 4884 63.9
## 7 5348 56.0
## 8 4809 54.6
## 9 4815 52.6
## 10 4091 40.6
## .. ... ...
# Build the group indicator in one pass: 0 when the graduation rate is at or
# below 53.25, 1 when it is above. Missing rates stay missing.
IncomeGrad$HSGrad1 <- ifelse(IncomeGrad$HSGrad <= 53.25, 0, 1)
IncomeGrad$HSGrad1
## [1] 0 1 1 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0 0
## [36] 0 1 0 0 0 1 0 0 1 1 0 1 0 1 1
# Inspect the column types, including the new HSGrad1 indicator.
glimpse(IncomeGrad)
## Observations: 50
## Variables:
## $ Income (int) 3624, 6315, 4530, 3378, 5114, 4884, 5348, 4809, 4815, ...
## $ HSGrad (dbl) 41.3, 66.7, 58.1, 39.9, 62.6, 63.9, 56.0, 54.6, 52.6, ...
## $ HSGrad1 (dbl) 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, ...
# NOTE(review): summarize() is called without any summary expressions, and the
# recorded output below is an empty data frame; summary() may have been
# intended here - confirm.
summarize(IncomeGrad)
## data frame with 0 columns and 0 rows
# Summary statistics of the 0/1 group indicator.
summary(IncomeGrad$HSGrad1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 0.5 0.5 1.0 1.0
A t-test will be used to analyze the data at an alpha level of .05
# Two-sample t-test of Income between the two HSGrad1 groups, assuming equal
# variances (var.equal = TRUE).
t.test(Income ~ HSGrad1, data = IncomeGrad, var.equal = TRUE)
##
## Two Sample t-test
##
## data: Income by HSGrad1
## t = -1.9642, df = 48, p-value = 0.05531
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -671.518476 7.838476
## sample estimates:
## mean in group 0 mean in group 1
## 4269.88 4601.72
I fail to reject the null hypothesis at the specified .05 level, t(48) = -1.96, p = .055, 95% CI [-671.52, 7.84].
Task 2.3b
Hypothesis (null) There is no difference in murder rates between one group of states (Alabama, Alaska, Arkansas, Georgia, Illinois, Kentucky, Louisiana, Mississippi, and Michigan) and a second group of states (Arizona, Connecticut, Iowa, Kansas, Maine, Minnesota, Nebraska, New Hampshire, North Dakota)
Hypothesis (alternative) There is a difference in murder rates between one group of states (Alabama, Alaska, Arkansas, Georgia, Illinois, Kentucky, Louisiana, Mississippi, and Michigan) and a second group of states (Arizona, Connecticut, Iowa, Kansas, Maine, Minnesota, Nebraska, New Hampshire, North Dakota)
To complete this test, the state names, which are stored as a factor, had to be recoded. Below is the code used to recode the data into two groups.
# Keep only the murder rate and the state name used to assign groups.
MurderStates <- select(states, Murder, stateNames)
MurderStates
## Source: local data frame [50 x 2]
##
## Murder stateNames
## 1 15.1 Alabama
## 2 11.3 Alaska
## 3 7.8 Arizona
## 4 10.1 Arkansas
## 5 10.3 California
## 6 6.8 Colorado
## 7 3.1 Connecticut
## 8 6.2 Delaware
## 9 10.7 Florida
## 10 13.9 Georgia
## .. ... ...
# Inspect the column types before recoding the state names.
glimpse(MurderStates)
## Observations: 50
## Variables:
## $ Murder (dbl) 15.1, 11.3, 7.8, 10.1, 10.3, 6.8, 3.1, 6.2, 10.7, 1...
## $ stateNames (fctr) Alabama, Alaska, Arizona, Arkansas, California, Co...
# NOTE(review): summarise() is called without any summary expressions, and the
# recorded output below is an empty data frame; summary() may have been
# intended here - confirm.
summarise(MurderStates)
## data frame with 0 columns and 0 rows
# Assign each selected state to group 0 or group 1; every other state is left
# as NA. The result is stored as a factor for the t-test.
# NOTE(review): Arkansas is named in the hypothesis text but is not assigned a
# code here, so it is left as NA - confirm whether that is intended.
MurderStates$scode <- factor(ifelse(
  MurderStates$stateNames %in% c(
    "Alabama", "Alaska", "Georgia", "Illinois",
    "Kentucky", "Louisiana", "Mississippi", "Michigan"
  ),
  0,
  ifelse(
    MurderStates$stateNames %in% c(
      "Arizona", "Connecticut", "Iowa", "Kansas", "Maine",
      "Minnesota", "Nebraska", "New Hampshire", "North Dakota"
    ),
    1,
    NA
  )
))
A t-test will be used to analyze the data at an alpha level of .05
# Two-sample t-test of Murder between the two coded state groups, assuming
# equal variances (var.equal = TRUE).
t.test(Murder ~ scode, data = MurderStates, var.equal = TRUE)
##
## Two Sample t-test
##
## data: Murder by scode
## t = 10.18, df = 15, p-value = 3.952e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7.023301 10.743366
## sample estimates:
## mean in group 0 mean in group 1
## 12.250000 3.366667
The null hypothesis is rejected at the specified .05 level, t(15) = 10.18, p < .05, 95% CI [7.02, 10.74] (group means: 12.25 vs. 3.37).