STOR 390
1/31/17
library(tidyverse)
# Read the salary CSV straight from the web into `data`. The sections below
# use its totalsal, age, dept and status columns.
data <- read_csv(
  url("http://ryanthornburg.com/wp-content/uploads/2015/05/UNC_Salares_NandO_2015-05-06.csv")
)
Generate questions about your data.
Search for answers by visualizing, transforming, and modelling your data.
Use what you learn to refine your questions and/or generate new questions.
EDA is fundamentally a creative process. And like most creative processes, the key to asking quality questions is to generate a large quantity of questions.
# Median of the total-salary column (the printed result follows).
median(data[["totalsal"]])
[1] 59342
# Largest value in the total-salary column (the printed result follows).
max(data[["totalsal"]])
[1] 819069
# Strip plot: every salary is drawn as a point on one horizontal line
# (y is fixed at 0); the y limits leave empty space around that line.
ggplot(data, aes(x = totalsal, y = 0)) +
  geom_point() +
  ylim(-10, 10)
# Same strip plot, but geom_jitter() adds a little random noise to each
# point's position so that overlapping salaries are spread apart.
data %>%
  ggplot() +
  geom_jitter(mapping = aes(x = totalsal, y = 0)) +
  ylim(-10, 10)
# Box plot of total salary; x is a constant, so a single box is drawn.
ggplot(data, aes(x = 0, y = totalsal)) +
  geom_boxplot()
# Histograms of total salary; only the number of bins changes between plots.

# 30 bins
ggplot(data, aes(x = totalsal)) +
  geom_histogram(bins = 30)

# 10000 bins
ggplot(data, aes(x = totalsal)) +
  geom_histogram(bins = 10000)

# 2 bins
ggplot(data, aes(x = totalsal)) +
  geom_histogram(bins = 2)

# 100 bins
ggplot(data, aes(x = totalsal)) +
  geom_histogram(bins = 100)
# Simulated sample: 4000 draws from N(0, 1) followed by 4000 draws from
# N(2.5, 1). The seed is fixed so the figures come out the same on every run.
set.seed(342)
mix <- tibble(
  val = c(
    rnorm(4000, mean = 0, sd = 1),
    rnorm(4000, mean = 2.5, sd = 1)
  )
)

# few bins, i.e. a wide binwidth
ggplot(mix, aes(x = val)) +
  geom_histogram(bins = 10)

# a moderate number of bins
ggplot(mix, aes(x = val)) +
  geom_histogram(bins = 30)

# very many bins, i.e. a small binwidth
ggplot(mix, aes(x = val)) +
  geom_histogram(bins = 2000)
# Kernel density estimates of total salary. `adjust` multiplies the
# bandwidth, so it controls how smooth the curve is.

# the default settings, spelled out: Gaussian kernel, adjust = 1
ggplot(data, aes(x = totalsal)) +
  geom_density(kernel = "gaussian", adjust = 1)

# wide window: bandwidth multiplied by 10
ggplot(data, aes(x = totalsal)) +
  geom_density(kernel = "gaussian", adjust = 10)

# narrow window: bandwidth multiplied by 0.1
ggplot(data, aes(x = totalsal)) +
  geom_density(kernel = "gaussian", adjust = .1)
# Scatter plot of total salary against age.
ggplot(data, aes(x = age, y = totalsal)) +
  geom_point()
# Correlation between age and total salary (the printed result follows).
cor(data[["age"]], data[["totalsal"]])
[1] 0.2355144
# Mean total salary for three selected departments, drawn as a bar chart.
# geom_col() takes the bar heights directly from the y values, which is the
# idiomatic replacement for geom_bar(stat = "identity").
data %>%
  filter(dept %in% c("Pediatrics", "Orthodontics", "Ophthalmology")) %>%
  group_by(dept) %>%
  summarise(mean_sal = mean(totalsal)) %>%
  ggplot() +
  geom_col(aes(x = dept, y = mean_sal))
# Box plots of total salary for the same three departments.
# coord_flip() swaps the axes so the department names sit on the vertical
# axis, where people can read them.
data %>%
  filter(dept %in% c("Pediatrics", "Orthodontics", "Ophthalmology")) %>%
  ggplot(mapping = aes(x = dept, y = totalsal)) +
  geom_boxplot() +
  coord_flip()
# Scatter plot of the built-in `faithful` data: waiting against eruptions.
faithful %>%
  ggplot(aes(x = eruptions, y = waiting)) +
  geom_point()
# Age against total salary again, with the points coloured by `status`.
ggplot(data, aes(x = age, y = totalsal, color = status)) +
  geom_point()