# Chapter 1: Visualizing two variables
# Import data
# Read the ncbirths CSV from the course resources folder.
# Alternative that resolves the path against the current working directory:
# ncbirths <- read.csv(
#   file.path(getwd(), "resources/rstudio/business statistics/data/ncbirths.csv")
# )
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well
# (the default changed to FALSE there), so str() matches the Factor columns
# shown in the recorded output of this script.
ncbirths <- read.csv(
  "/resources/rstudio/business statistics/data/ncbirths.csv",
  stringsAsFactors = TRUE
)
# Preview the first rows of the data frame
head(ncbirths)
## fage mage mature weeks premie visits marital gained weight
## 1 NA 13 younger mom 39 full term 10 married 38 7.63
## 2 NA 14 younger mom 42 full term 15 married 20 7.88
## 3 19 15 younger mom 37 full term 11 married 38 6.63
## 4 21 15 younger mom 41 full term 6 married 34 8.00
## 5 NA 15 younger mom 39 full term 9 married 27 6.38
## 6 NA 15 younger mom 38 full term 19 married 22 5.38
## lowbirthweight gender habit whitemom
## 1 not low male nonsmoker not white
## 2 not low male nonsmoker not white
## 3 not low female nonsmoker white
## 4 not low male nonsmoker white
## 5 not low female nonsmoker not white
## 6 low male nonsmoker not white
# Print the structure of ncbirths: its dimensions and, for every column,
# the type and the first few values
str(ncbirths)
## 'data.frame': 1000 obs. of 13 variables:
## $ fage : int NA NA 19 21 NA NA 18 17 NA 20 ...
## $ mage : int 13 14 15 15 15 15 15 15 16 16 ...
## $ mature : Factor w/ 2 levels "mature mom","younger mom": 2 2 2 2 2 2 2 2 2 2 ...
## $ weeks : int 39 42 37 41 39 38 37 35 38 37 ...
## $ premie : Factor w/ 3 levels "<NA>","full term",..: 2 2 2 2 2 2 2 3 2 2 ...
## $ visits : int 10 15 11 6 9 19 12 5 9 13 ...
## $ marital : Factor w/ 3 levels "<NA>","married",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ gained : int 38 20 38 34 27 22 76 15 NA 52 ...
## $ weight : num 7.63 7.88 6.63 8 6.38 5.38 8.44 4.69 8.81 6.94 ...
## $ lowbirthweight: Factor w/ 2 levels "low","not low": 2 2 2 2 2 1 2 1 2 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 2 2 2 1 ...
## $ habit : Factor w/ 3 levels "<NA>","nonsmoker",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ whitemom : Factor w/ 3 levels "<NA>","not white",..: 2 2 3 3 2 2 2 2 3 3 ...
#new variables
library(openintro)
library(ggplot2)
library(dplyr)
# Bring the countyComplete data set (shipped with openintro) into the session
data(countyComplete)

# Add `rural`: a factor that is "rural" where density is below 500 and
# "urban" otherwise
countyComplete$rural <- factor(
  ifelse(countyComplete$density < 500, "rural", "urban")
)
# Scatterplot with per_capita_income on the x-axis and bachelors on the y-axis
ggplot(countyComplete, aes(x = per_capita_income, y = bachelors)) +
  geom_point()

# Boxplots of bachelors, one per bin, after cutting per_capita_income into
# four equal-width intervals
ggplot(
  data = countyComplete,
  mapping = aes(x = cut(per_capita_income, breaks = 4), y = bachelors)
) +
  geom_boxplot()

# Both the scatterplot and the box plot show a positive association between a county's bachelors value and its per-capita income. In the scatterplot, the counties with the highest per-capita incomes also tend to have the highest bachelors values, while the dense cluster lower down combines lower bachelors values with lower incomes.
# The box plot shows the same pattern more simply: in the lowest income bin, bachelors is mostly below 20 (with some outliers), while in the highest income bin it is about 40 or more.
# Scatterplot of bachelors against per_capita_income, with points coloured by
# the rural/urban indicator. `rural` is mapped directly: it is already a
# discrete variable, and wrapping it in factor() would only make the legend
# title read "factor(rural)".
ggplot(
  data = countyComplete,
  aes(x = per_capita_income, y = bachelors, color = rural)
) +
  geom_point()

# Load the package
library(dplyr)
# Row count and correlation between per_capita_income and bachelors
countyComplete %>%
  summarize(
    N = n(),
    r = cor(x = per_capita_income, y = bachelors)
  )
## N r
## 1 3143 0.7924464
# Same summary, but the correlation uses only pairs where both values are
# present. N is still the total number of rows, not the number of pairs used.
countyComplete %>%
  summarize(
    N = n(),
    r = cor(
      x = per_capita_income,
      y = bachelors,
      use = "pairwise.complete.obs"
    )
  )
## N r
## 1 3143 0.7924464
# The correlation coefficient between per_capita_income and bachelors is about 0.79 on a scale from -1 to 1. This indicates a strong positive linear association: counties with higher bachelors values tend to have higher per-capita incomes.