Introduction

This is the code I used to solve the exercises from the R Visualization course on edX. It covers the Titanic project and the final assessment.

Titanic Project

options(digits = 3)    # report 3 significant digits
library(tidyverse)
library(titanic)

# Preview the first six rows of the raw training data.
head(titanic_train, n = 6)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket  Fare Cabin Embarked
## 1        A/5 21171  7.25              S
## 2         PC 17599 71.28   C85        C
## 3 STON/O2. 3101282  7.92              S
## 4           113803 53.10  C123        S
## 5           373450  8.05              S
## 6           330877  8.46              Q
# Restrict the raw data to the columns the exercises use, then store the
# three categorical variables as factors.
titanic <- select(
  titanic_train,
  Survived, Pclass, Sex, Age, SibSp, Parch, Fare
)
titanic <- mutate(
  titanic,
  Survived = factor(Survived),
  Pclass = factor(Pclass),
  Sex = factor(Sex)
)

lapply(titanic, class) # class of each column in the data frame
## $Survived
## [1] "factor"
## 
## $Pclass
## [1] "factor"
## 
## $Sex
## [1] "factor"
## 
## $Age
## [1] "numeric"
## 
## $SibSp
## [1] "integer"
## 
## $Parch
## [1] "integer"
## 
## $Fare
## [1] "numeric"

Question 2: Demographics of Titanic passengers

library(dplyr)
library(ggplot2)

# First choice: stacked age densities by sex, using only passengers whose age
# is recorded.
titanic %>%
  filter(!is.na(Age)) %>%
  ggplot(aes(x = Age, fill = Sex)) +
  geom_density(alpha = 0.2, bw = 2, position = "stack")

# Third choice (method 1): passenger counts by sex, via the pipe.
titanic %>% pull(Sex) %>% table()
## .
## female   male 
##    314    577
# Third choice (method 2): the same counts with base subsetting.
table(titanic[["Sex"]])
## 
## female   male 
##    314    577
# Fourth choice: counts by sex among passengers aged exactly 40.
titanic %>% filter(Age == 40) %>% pull(Sex) %>% table()
## .
## female   male 
##      6      7
# Fifth choice: counts by sex for ages 18 to 35 inclusive
# (divide by the total count of each sex to compare proportions).
titanic %>% filter(between(Age, 18, 35)) %>% pull(Sex) %>% table()
## .
## female   male 
##    133    251
# Sixth choice: counts by sex for passengers younger than 17
# (again, divide by the total count of each sex).
titanic %>% filter(Age < 17) %>% pull(Sex) %>% table()
## .
## female   male 
##     49     51
# Seventh choice: sex of the oldest passenger.
with(titanic, Sex[which.max(Age)])
## [1] male
## Levels: female male

Question 3: QQ-plot of Age Distribution

# Mean and standard deviation of the recorded ages; these parameterise the
# theoretical normal distribution that the QQ-plot compares against.
params <- titanic %>%
    filter(!is.na(Age)) %>%
    summarize(mean = mean(Age), sd = sd(Age))
head(params)
##   mean   sd
## 1 29.7 14.5
# Drop missing ages before plotting too, so the plotted sample is the same set
# of passengers the parameters were computed from and stat_qq does not have to
# discard NA rows with a warning.
titanic %>%
    filter(!is.na(Age)) %>%
    ggplot(aes(sample = Age)) +
    geom_qq(dparams = params) +
    geom_abline()

Question 4: Survival by Sex

# Side-by-side bars of survival status, split by sex.
ggplot(titanic, aes(x = Survived, fill = Sex)) +
  geom_bar(position = "dodge")

# Passenger count per survival status (less than half survived).
titanic %>%
  group_by(Survived) %>%
  summarise(n = n())
## # A tibble: 2 x 2
##   Survived     n
##   <fct>    <int>
## 1 0          549
## 2 1          342

Question 5: Survival by Age

# Stacked count-scaled age densities by survival status.
# Missing ages are removed up front so the density stat does not drop them
# with a warning, and after_stat(count) replaces the deprecated ..count..
# notation for referring to the computed count variable.
titanic %>%
    filter(!is.na(Age)) %>%
    ggplot(aes(Age, y = after_stat(count), fill = Survived)) +
    geom_density(alpha = 0.2, position = "stack")

Question 6: Survival by Fare

# Fare by survival status on a log2 scale; zero fares are excluded because
# they cannot be shown on a log axis. Individual passengers are jittered on top
# of the boxplots.
titanic %>%
  filter(Fare != 0) %>%
  ggplot(aes(x = Survived, y = Fare, fill = Survived)) +
  geom_boxplot(alpha = 0.2) +
  scale_y_continuous(trans = "log2") +
  geom_jitter()

Question 7: Survival by Passenger Class

# First bar plot: passenger counts per class, coloured by survival.
ggplot(titanic, aes(x = Pclass, fill = Survived)) +
  geom_bar()

# Second bar plot: survival proportions within each class.
ggplot(titanic, aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "fill")

# Third bar plot: class proportions within each survival status.
ggplot(titanic, aes(x = Survived, fill = Pclass)) +
  geom_bar(position = "fill")

Question 8: Survival by Age, Sex, and Passenger Class

# Stacked count-scaled age densities by survival status, one panel per
# sex (rows) and passenger class (columns).
# Missing ages are removed up front so the density stat does not drop them
# with a warning, and after_stat(count) replaces the deprecated ..count..
# notation for referring to the computed count variable.
titanic %>%
  filter(!is.na(Age)) %>%
  ggplot(aes(Age, y = after_stat(count), fill = Survived)) +
  geom_density(alpha = 0.2, position = "stack") +
  facet_grid(Sex ~ Pclass)

Final Assessment

library(tidyverse)
library(dslabs)
data(stars)
options(digits = 3)   # report 3 significant digits

Question 1

# Load the stars data set and preview its first six rows.
data(stars)
head(stars, n = 6)
##             star magnitude temp type
## 1            Sun       4.8 5840    G
## 2        SiriusA       1.4 9620    A
## 3        Canopus      -3.1 7400    F
## 4       Arcturus      -0.4 4590    K
## 5 AlphaCentauriA       4.3 5840    G
## 6           Vega       0.5 9900    A
# Mean magnitude.
stars %>% pull(magnitude) %>% mean()
## [1] 4.26
# Standard deviation of magnitude.
stars %>% pull(magnitude) %>% sd()
## [1] 7.35

Question 2

# Density of star magnitudes.
stars %>%
  ggplot(aes(x = magnitude)) +
  geom_density()

Question 3

# Density of star temperatures.
ggplot(stars, aes(x = temp)) + geom_density()

Question 4

# Scatter plot of magnitude against temperature.
ggplot(stars, aes(x = temp, y = magnitude)) +
  geom_point()

Question 5 and 8

library(ggrepel)
# Magnitude against temperature with both axes flipped, temperature on a log10
# scale, and every star labelled.
# A plot can hold only one x scale: adding scale_x_reverse() after
# scale_x_continuous(trans = "log10") replaces the log scale instead of
# combining with it. Composing both transformations in a single scale keeps
# the axis log10-transformed AND reversed.
stars %>%
  ggplot(aes(temp, magnitude)) +
  geom_point() +
  geom_text(aes(label = star), hjust = -0.15, vjust = 1) +
  scale_y_reverse() +
  scale_x_continuous(trans = c("log10", "reverse"))

Question 9

# Magnitude against temperature, coloured by star type, with both axes flipped
# and temperature on a log10 scale.
# A plot can hold only one x scale: adding scale_x_reverse() after
# scale_x_continuous(trans = "log10") replaces the log scale instead of
# combining with it. Composing both transformations in a single scale keeps
# the axis log10-transformed AND reversed.
stars %>%
  ggplot(aes(temp, magnitude, color = type)) +
  geom_point() +
  scale_y_reverse() +
  scale_x_continuous(trans = c("log10", "reverse"))

Climate Change

library(tidyverse)
library(dslabs)
data(temp_carbon)
data(greenhouse_gases)
data(historic_co2)

Question 1

# Preview the first six rows of the temperature/carbon data.
head(temp_carbon, n = 6)
##   year temp_anomaly land_anomaly ocean_anomaly carbon_emissions
## 1 1880        -0.11        -0.48         -0.01              236
## 2 1881        -0.08        -0.40          0.01              243
## 3 1882        -0.10        -0.48          0.00              256
## 4 1883        -0.18        -0.66         -0.04              272
## 5 1884        -0.26        -0.69         -0.14              275
## 6 1885        -0.25        -0.56         -0.17              277
# Candidate expressions for the latest year that has a non-missing
# carbon_emissions value. Each keeps only rows where carbon_emissions is not
# NA, extracts the year column, and takes its maximum; each printed 2014.
# NOTE(review): the "second option" and "fourth option" snippets are identical
# as written here -- confirm whether one of them was meant to use a different
# extraction step.
temp_carbon %>%
    filter(!is.na(carbon_emissions)) %>%
    .$year %>%
    max() #second option
## [1] 2014
temp_carbon %>%
    filter(!is.na(carbon_emissions)) %>%
    .$year %>%
    max() #fourth option
## [1] 2014
# Here the year column stays a one-column data frame and max() is applied to
# that data frame rather than to a plain vector.
temp_carbon %>%
    filter(!is.na(carbon_emissions)) %>%
    select(year) %>%
    max() #fifth option
## [1] 2014

Question 2

# Earliest year that has a non-missing carbon_emissions value.
temp_carbon %>%
  filter(!is.na(carbon_emissions)) %>%
  pull(year) %>%
  min()
## [1] 1751
# Emissions in 2014 divided by emissions in the earliest year of the data set.
with(
  temp_carbon,
  carbon_emissions[year == 2014] / carbon_emissions[which.min(year)]
)
## [1] 3285

Question 3

# First year with a recorded temperature anomaly.
temp_carbon %>% filter(!is.na(temp_anomaly)) %>% pull(year) %>% min()
## [1] 1880
# Last year with a recorded temperature anomaly.
temp_carbon %>% filter(!is.na(temp_anomaly)) %>% pull(year) %>% max()
## [1] 2018
# Temperature anomaly in the first and last of those years.
temp1 <- temp_carbon$temp_anomaly[which(temp_carbon$year == 1880)]

temp2 <- temp_carbon$temp_anomaly[which(temp_carbon$year == 2018)]

# Change in the anomaly between 1880 and 2018.
temp2 - temp1
## [1] 0.93

Question 4

# Time series of the temperature anomaly, using only years where it is recorded.
p <- temp_carbon %>%
  filter(!is.na(temp_anomaly)) %>%
  ggplot(aes(year, temp_anomaly)) +
  geom_line()

# Add a reference line at zero with a label, plus axis label and title.
# The label is drawn with annotate() rather than geom_text(aes(...)): a
# geom_text layer with constant aesthetics inherits the plot data and draws the
# same label once per row, which overplots the text. For the same reason the
# reference line takes yintercept as a fixed parameter instead of a mapping.
p +
  geom_hline(yintercept = 0, colour = "blue") +
  ylab("Temperature anomaly (degrees C)") +
  ggtitle("Temperature anomaly relative to 20th century mean, 1880-2018") +
  annotate(
    "text",
    x = 2000, y = 0.05,
    label = "20th century mean",
    colour = "blue"
  )

Question 7

# Global (red), land (brown) and ocean (blue) temperature anomalies over time,
# using only years where all three are recorded.
# The "20th century mean" label is drawn with annotate() rather than
# geom_text(aes(...)): a geom_text layer with constant aesthetics inherits the
# plot data and draws the same label once per row, which overplots the text.
# For the same reason the zero reference line takes yintercept as a fixed
# parameter instead of a mapping.
temp_carbon %>%
  filter(!is.na(temp_anomaly) & !is.na(ocean_anomaly) & !is.na(land_anomaly)) %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = temp_anomaly), color = "red") +
  geom_line(aes(y = land_anomaly), color = "brown") +
  geom_line(aes(y = ocean_anomaly), color = "blue") +
  geom_hline(yintercept = 0, colour = "blue") +
  ylab("Temperature anomaly (degrees C)") +
  ggtitle("Temperature anomaly relative to 20th century mean, 1880-2018") +
  annotate(
    "text",
    x = 2000, y = 0.05,
    label = "20th century mean",
    colour = "blue"
  )

Question 8

# Concentration of each greenhouse gas over time, one panel per gas with free
# axis scales, and a vertical marker at the year 1850.
ggplot(greenhouse_gases, aes(x = year, y = concentration)) +
  geom_line() +
  geom_vline(aes(xintercept = 1850)) +
  facet_grid(gas ~ ., scales = "free") +
  ylab("Concentration (ch4/n2o ppb, co2 ppm)") +
  ggtitle("Atmospheric greenhouse gas concentration by year, 0-2000")

Question 10

# Carbon emissions over time for years where they are recorded, with a
# vertical marker at the year 1850.
temp_carbon %>%
  filter(!is.na(carbon_emissions)) %>%
  ggplot(aes(x = year, y = carbon_emissions)) +
  geom_line() +
  geom_vline(aes(xintercept = 1850))

Question 11 and 12

# CO2 concentration over time, one coloured line per data source, using only
# rows where co2 is recorded.
co2_time <- historic_co2 %>%
  filter(!is.na(co2)) %>%
  ggplot(aes(x = year, y = co2, colour = source)) +
  geom_line()

# The same plot restricted to four different year windows.
co2_time + scale_x_continuous(limits = c(-800000, -775000))

co2_time + scale_x_continuous(limits = c(-375000, -330000))

co2_time + scale_x_continuous(limits = c(-140000, -120000))

co2_time + scale_x_continuous(limits = c(-3000, 2018))