# This is the code I used to solve the exercises from the R Visualization course by edX. It covers the Titanic Project and the final assessment.
options(digits = 3) # report 3 significant digits
library(tidyverse)
library(titanic)
# Preview the first rows of the raw training data.
titanic_train %>% head()
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.25 S
## 2 PC 17599 71.28 C85 C
## 3 STON/O2. 3101282 7.92 S
## 4 113803 53.10 C123 S
## 5 373450 8.05 S
## 6 330877 8.46 Q
# Keep only the columns used in the exercises and convert Survived, Pclass
# and Sex to factors.
titanic <- titanic_train %>%
  select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare) %>%
  mutate(across(c(Survived, Pclass, Sex), factor))

# Report the class of every column in the data frame.
lapply(titanic, class)
## $Survived
## [1] "factor"
##
## $Pclass
## [1] "factor"
##
## $Sex
## [1] "factor"
##
## $Age
## [1] "numeric"
##
## $SibSp
## [1] "integer"
##
## $Parch
## [1] "integer"
##
## $Fare
## [1] "numeric"
library(dplyr)
library(ggplot2)
# First choice: stacked age density by sex, restricted to passengers with a
# recorded age.
titanic %>%
  filter(!is.na(Age)) %>%
  ggplot(aes(x = Age, fill = Sex)) +
  geom_density(alpha = 0.2, bw = 2, position = "stack")

# Third choice (method 1): number of passengers of each sex.
titanic %>%
  pull(Sex) %>%
  table()
## .
## female male
## 314 577

# Third choice (method 2): the same count without the pipe.
table(titanic[["Sex"]])
##
## female male
## 314 577

# Fourth choice: passengers aged exactly 40, by sex.
titanic %>%
  filter(Age == 40) %>%
  pull(Sex) %>%
  table()
## .
## female male
## 6 7

# Fifth choice: passengers aged 18 to 35 (inclusive), by sex. Divide by the
# total count of each sex to compare proportions.
titanic %>%
  filter(between(Age, 18, 35)) %>%
  pull(Sex) %>%
  table()
## .
## female male
## 133 251

# Sixth choice: passengers younger than 17, by sex. Divide by the total count
# of each sex to compare proportions.
titanic %>%
  filter(Age < 17) %>%
  pull(Sex) %>%
  table()
## .
## female male
## 49 51

# Seventh choice: sex of the oldest passenger.
with(titanic, Sex[which.max(Age)])
## [1] male
## Levels: female male
# Mean and standard deviation of the recorded ages. These parameterize the
# theoretical normal distribution used by the QQ plot below.
params <- titanic %>%
  filter(!is.na(Age)) %>%
  summarize(mean = mean(Age), sd = sd(Age))
head(params)
## mean sd
## 1 29.7 14.5

# QQ plot of passenger age against that normal distribution, with an identity
# line. Missing ages are dropped first so the plotted sample is the same one
# used to estimate `params`, and ggplot2 does not have to discard rows itself
# (which it does with a warning).
titanic %>%
  filter(!is.na(Age)) %>%
  ggplot(aes(sample = Age)) +
  geom_qq(dparams = params) +
  geom_abline()
# Survival counts split by sex, with the bars placed side by side.
titanic %>%
  ggplot(aes(x = Survived, fill = Sex)) +
  geom_bar(position = "dodge")

# Number of passengers per survival status; less than half survived.
titanic %>%
  group_by(Survived) %>%
  summarise(n = n())
## # A tibble: 2 x 2
## Survived n
## <fct> <int>
## 1 0 549
## 2 1 342
# Stacked count-scaled density of age, filled by survival status.
# `after_stat(count)` replaces the deprecated `..count..` notation, and
# missing ages are filtered out up front so ggplot2 does not drop them with a
# warning.
titanic %>%
  filter(!is.na(Age)) %>%
  ggplot(aes(x = Age, y = after_stat(count), fill = Survived)) +
  geom_density(alpha = 0.2, position = "stack")
# Boxplot of fare by survival status on a log2 scale with jittered points on
# top. Zero fares are excluded before plotting.
titanic %>%
  filter(Fare != 0) %>%
  ggplot(aes(x = Survived, y = Fare, fill = Survived)) +
  geom_boxplot(alpha = 0.2) +
  scale_y_continuous(trans = "log2") +
  geom_jitter()
# First bar plot: passenger counts per class, filled by survival status.
titanic %>%
  ggplot(aes(x = Pclass, fill = Survived)) +
  geom_bar()

# Second bar plot: the same bars rescaled to proportions within each class.
titanic %>%
  ggplot(aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "fill")

# Third bar plot: proportions of each class within each survival status.
titanic %>%
  ggplot(aes(x = Survived, fill = Pclass)) +
  geom_bar(position = "fill")
# Stacked count-scaled density of age by survival status, faceted by sex
# (rows) and passenger class (columns). `after_stat(count)` replaces the
# deprecated `..count..` notation, and missing ages are filtered out up front
# so ggplot2 does not drop them with a warning.
titanic %>%
  filter(!is.na(Age)) %>%
  ggplot(aes(x = Age, y = after_stat(count), fill = Survived)) +
  geom_density(alpha = 0.2, position = "stack") +
  facet_grid(Sex ~ Pclass)
library(tidyverse)
library(dslabs)
# Report 3 significant digits, load the stars data set and preview its first
# rows.
options(digits = 3)
data(stars)
stars %>% head()
## star magnitude temp type
## 1 Sun 4.8 5840 G
## 2 SiriusA 1.4 9620 A
## 3 Canopus -3.1 7400 F
## 4 Arcturus -0.4 4590 K
## 5 AlphaCentauriA 4.3 5840 G
## 6 Vega 0.5 9900 A
# Mean magnitude.
stars %>% pull(magnitude) %>% mean()
## [1] 4.26
# Standard deviation of magnitude.
stars %>% pull(magnitude) %>% sd()
## [1] 7.35

# Density of magnitude.
stars %>%
  ggplot(aes(x = magnitude)) +
  geom_density()

# Density of temperature.
ggplot(stars, aes(x = temp)) + geom_density()

# Scatter plot of magnitude against temperature.
ggplot(stars, aes(x = temp, y = magnitude)) + geom_point()
library(ggrepel)
# Magnitude against temperature with both axes flipped and temperature on a
# log10 scale, labelling every star.
# A plot can hold only one x scale: adding scale_x_reverse() after
# scale_x_continuous(trans = "log10") replaces the log scale instead of
# combining with it. Both transformations are therefore composed in a single
# scale.
stars %>%
  ggplot(aes(temp, magnitude)) +
  geom_point() +
  geom_text(aes(label = star), hjust = -0.15, vjust = 1) +
  scale_x_continuous(trans = c("log10", "reverse")) +
  scale_y_reverse()

# Same axes, with points coloured by star type instead of labelled.
stars %>%
  ggplot(aes(temp, magnitude, color = type)) +
  geom_point() +
  scale_x_continuous(trans = c("log10", "reverse")) +
  scale_y_reverse()
library(tidyverse)
library(dslabs)
# Load the three climate data sets and preview the first rows of temp_carbon.
data(temp_carbon, greenhouse_gases, historic_co2)
temp_carbon %>% head()
## year temp_anomaly land_anomaly ocean_anomaly carbon_emissions
## 1 1880 -0.11 -0.48 -0.01 236
## 2 1881 -0.08 -0.40 0.01 243
## 3 1882 -0.10 -0.48 0.00 256
## 4 1883 -0.18 -0.66 -0.04 272
## 5 1884 -0.26 -0.69 -0.14 275
## 6 1885 -0.25 -0.56 -0.17 277
# Second option: latest year with a recorded carbon emissions value.
temp_carbon %>%
  filter(!is.na(carbon_emissions)) %>%
  pull(year) %>%
  max()
## [1] 2014
# Fourth option: the same calculation without a pipe.
with(filter(temp_carbon, !is.na(carbon_emissions)), max(year))
## [1] 2014
# Fifth option: take the maximum of the one-column data frame holding year.
temp_carbon %>%
  filter(!is.na(carbon_emissions)) %>%
  select(year) %>%
  max()
## [1] 2014
# Earliest year with a recorded carbon emissions value.
temp_carbon %>%
  filter(!is.na(carbon_emissions)) %>%
  pull(year) %>%
  min()
## [1] 1751
# Ratio of the 2014 emissions to the emissions in the row with the smallest
# year.
with(
  temp_carbon,
  carbon_emissions[year == 2014] / carbon_emissions[which.min(year)]
)
## [1] 3285
# Earliest year with a recorded temperature anomaly.
temp_carbon %>%
  filter(!is.na(temp_anomaly)) %>%
  pull(year) %>%
  min()
## [1] 1880
# Latest year with a recorded temperature anomaly.
temp_carbon %>%
  filter(!is.na(temp_anomaly)) %>%
  pull(year) %>%
  max()
## [1] 2018
# Temperature anomaly in the first and last of those years, and the change
# between them.
temp1 <- temp_carbon %>%
  filter(year == 1880) %>%
  pull(temp_anomaly)
temp2 <- temp_carbon %>%
  filter(year == 2018) %>%
  pull(temp_anomaly)
temp2 - temp1
## [1] 0.93
# Line plot of the temperature anomaly by year, for years with a recorded
# value.
p <- temp_carbon %>%
  filter(!is.na(temp_anomaly)) %>%
  ggplot(aes(year, temp_anomaly)) +
  geom_line()

# Add a blue reference line at zero, axis label, title and a text label for
# the reference line. The label is added with annotate() rather than
# geom_text(aes(...)): geom_text() draws one label per row of the plot data,
# so a constant label would be overplotted once for every year.
p +
  geom_hline(aes(yintercept = 0), col = "blue") +
  ylab("Temperature anomaly (degrees C)") +
  ggtitle("Temperature anomaly relative to 20th century mean, 1880-2018") +
  annotate("text", x = 2000, y = 0.05, label = "20th century mean",
           colour = "blue")
# Global (red), land (brown) and ocean (blue) temperature anomalies by year,
# restricted to years where all three are recorded, with a blue reference
# line at zero.
# The reference label is added with annotate() rather than
# geom_text(aes(...)): geom_text() draws one label per row of the plot data,
# so a constant label would be overplotted once for every year.
temp_carbon %>%
  filter(!is.na(temp_anomaly) & !is.na(ocean_anomaly) & !is.na(land_anomaly)) %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = temp_anomaly), color = "red") +
  geom_line(aes(y = land_anomaly), color = "brown") +
  geom_line(aes(y = ocean_anomaly), color = "blue") +
  geom_hline(aes(yintercept = 0), col = "blue") +
  ylab("Temperature anomaly (degrees C)") +
  ggtitle("Temperature anomaly relative to 20th century mean, 1880-2018") +
  annotate("text", x = 2000, y = 0.05, label = "20th century mean",
           colour = "blue")
# Concentration of each greenhouse gas by year, one facet row per gas with
# free scales, and a vertical marker at the year 1850.
ggplot(greenhouse_gases, aes(x = year, y = concentration)) +
  geom_line() +
  geom_vline(aes(xintercept = 1850)) +
  facet_grid(rows = vars(gas), scales = "free") +
  labs(
    y = "Concentration (ch4/n2o ppb, co2 ppm)",
    title = "Atmospheric greenhouse gas concentration by year, 0-2000"
  )
# Carbon emissions by year for years with a recorded value, with a vertical
# marker at the year 1850.
temp_carbon %>%
  filter(!is.na(carbon_emissions)) %>%
  ggplot(aes(x = year, y = carbon_emissions)) +
  geom_line() +
  geom_vline(aes(xintercept = 1850))
# CO2 concentration over time, one coloured line per data source, using only
# rows with a recorded co2 value.
co2_time <- historic_co2 %>%
  filter(!is.na(co2)) %>%
  ggplot(aes(x = year, y = co2, color = source)) +
  geom_line()

# The same plot restricted to four different year ranges.
co2_time + scale_x_continuous(limits = c(-800000, -775000))
co2_time + scale_x_continuous(limits = c(-375000, -330000))
co2_time + scale_x_continuous(limits = c(-140000, -120000))
co2_time + scale_x_continuous(limits = c(-3000, 2018))