# Load previously saved R objects into the workspace.
# NOTE(review): the first two paths are home-directory paths specific to one
# machine, and "county.rda" is resolved against the current working directory;
# adjust them when running elsewhere.
# Presumably cdc.Rdata supplies the `cdc` data frame used below - confirm.
load("~/Dropbox/Documents/SMU/CSC 360-530/cdc.Rdata")
load("~/Dropbox/Documents/Pierce/Spring 20 146/OAW.Rdata")
load("county.rda")
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.4
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Let’s look at the CDC data.
# Show the compact structure of cdc: dimensions, column names and types
str(cdc)
## 'data.frame': 20000 obs. of 9 variables:
## $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
## $ exerany : num 0 0 1 1 0 1 1 0 0 1 ...
## $ hlthplan: num 1 1 1 1 1 1 1 1 1 1 ...
## $ smoke100: num 0 1 1 0 0 0 0 0 1 0 ...
## $ height : num 70 64 60 66 61 64 71 67 65 70 ...
## $ weight : int 175 125 105 132 150 114 194 170 150 180 ...
## $ wtdesire: int 175 115 105 124 130 114 185 160 130 170 ...
## $ age : int 77 33 49 42 55 55 31 45 27 44 ...
## $ gender : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
Use height to demonstrate viewing a single quantitative variable.
# Five-number summary plus the mean of the height column
summary(cdc$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 48.00 64.00 67.00 67.18 70.00 93.00
# Histogram of height with the default binning (no bins/binwidth supplied)
ggplot(cdc, mapping = aes(x = height)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Reduce the number of bins. Experiment to find a good value.
# Same histogram, but with the number of bins fixed at 15
ggplot(cdc, mapping = aes(x = height)) +
  geom_histogram(bins = 15)
Try geom_density() instead.
# Kernel density estimate of height using the default bandwidth
ggplot(cdc, mapping = aes(x = height)) +
  geom_density()
Read the documentation at https://ggplot2.tidyverse.org/reference/geom_density.html and look at the adjust parameter. Play with it to find a value you like.
# Density of height with the bandwidth multiplier halved (adjust = 0.5)
ggplot(cdc, mapping = aes(x = height)) +
  geom_density(adjust = 0.5)

# Density of height with the bandwidth multiplier raised to 1.5
ggplot(cdc, mapping = aes(x = height)) +
  geom_density(adjust = 1.5)
Get a table of the specific values of height between 68 and 75 inches.
# Keep the rows whose height lies in the closed range 68-75 inches,
# then count how often each distinct height value occurs.
suspicious <- cdc %>%
  filter(between(height, 68, 75))
table(suspicious$height)
##
## 68 69 70 71 72 73 74 75
## 1505 1380 1500 1296 1393 784 605 321
Compare cdc$height with a random sample from a normal distribution with the same mean and standard deviation.
First, get the normal numbers.
# Draw one simulated height per row of cdc from a normal distribution with the
# same mean and standard deviation as the observed heights. The sample size is
# taken from the data rather than hard-coded, so both samples stay the same
# size if cdc changes.
# NOTE(review): no seed is set, so the simulated values differ on every run;
# call set.seed() beforehand if reproducible output is required.
fakes <- rnorm(nrow(cdc), mean = mean(cdc$height), sd = sd(cdc$height))

# Tag each sample with its origin so the two can be stacked and compared.
fake_df <- data.frame(label = "fake", height = fakes)
real_df <- data.frame(label = "real", height = cdc$height)

# Preview the first rows of the simulated sample
head(fake_df)
## label height
## 1 fake 62.48077
## 2 fake 66.41947
## 3 fake 64.56457
## 4 fake 68.17673
## 5 fake 68.10438
## 6 fake 60.30181
# Preview the first rows of the observed sample
head(real_df)
## label height
## 1 real 70
## 2 real 64
## 3 real 60
## 4 real 66
## 5 real 61
## 6 real 64
# Stack the simulated rows on top of the observed rows (fake first, then real)
# and confirm the structure of the combined data frame.
both <- rbind(fake_df, real_df)
str(both)
## 'data.frame': 40000 obs. of 2 variables:
## $ label : chr "fake" "fake" "fake" "fake" ...
## $ height: num 62.5 66.4 64.6 68.2 68.1 ...
Compare the fake and real data.
# Overlay one density curve per sample, coloured by the label column
ggplot(both, mapping = aes(x = height, color = label)) +
  geom_density()
Could this be because we have a mixture of genders?
# One height density curve per gender, with the bandwidth multiplier at 1.5
gender_plot <- ggplot(data = cdc) +
  geom_density(
    mapping = aes(x = height, color = gender),
    adjust = 1.5
  )

# Convert the static ggplot into an interactive plotly widget
ggplotly(gender_plot)
The distribution of male heights has a flat peak