Startup

The tidyverse is used to process the data and draw most of the plots; rbokeh is used for plot 5. The working directory is set to the folder that holds all of the class files.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rbokeh)
# Point the session at the course folder.
# NOTE(review): this is a machine-specific path, so the call fails on any
# computer where the folder does not exist. None of the steps shown in this
# report read or write a file -- confirm whether the call is still needed.
setwd("~/Data Study/Data110 MC/Week 1")

# Take a fresh working copy of the built-in dataset. Naming the package
# explicitly means a re-run starts from the original data instead of picking up
# an `airquality` object that was already modified earlier in the session.
airquality <- datasets::airquality

EDA - Exploratory Data Analysis

The first few steps follow the sample description for this project. The later portion looks at how missing values (NA) are handled. After checking, none of the plots p1-p4 use data from the columns that contain NA values.

# Centre of the daily temperature. The column is reached first by name and then
# by position (Temp is column 4), so both means agree.
mean(airquality[["Temp"]])
## [1] 77.88235
mean(airquality[[4]])
## [1] 77.88235
median(airquality[["Temp"]])
## [1] 79
# Spread of the wind speed: standard deviation, then variance
with(airquality, sd(Wind))
## [1] 3.523001
with(airquality, var(Wind))
## [1] 12.41154
# Column types and leading values of the data frame
str(object = airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
# Preview the first six rows
head(x = airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
# Relabel and re-review
# Swap each month number (5-9) for its English name. The first replacement
# turns the integer column into character; the later comparisons still match
# because R then compares the remaining numbers as text.
airquality$Month <- local({
  month_col <- airquality$Month
  for (month_number in 5:9) {
    month_col[month_col == month_number] <- month.name[month_number]
  }
  month_col
})
#review summary of data structure
str(object = airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : chr  "May" "May" "May" "May" ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
# Preview the first six rows again
head(x = airquality, n = 6L)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67   May   1
## 2    36     118  8.0   72   May   2
## 3    12     149 12.6   74   May   3
## 4    18     313 11.5   62   May   4
## 5    NA      NA 14.3   56   May   5
## 6    28      NA 14.9   66   May   6
# Per-column summary statistics, including the NA counts
summary(object = airquality)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##     Month                Day      
##  Length:153         Min.   : 1.0  
##  Class :character   1st Qu.: 8.0  
##  Mode  :character   Median :16.0  
##                     Mean   :15.8  
##                     3rd Qu.:23.0  
##                     Max.   :31.0  
## 
# Turn Month into a factor whose levels follow calendar order
airquality$Month <- factor(
  x = airquality$Month,
  levels = c("May", "June", "July", "August", "September")
)

Data Wrangling

# check for N/A
# Positions (named by column) of the columns that hold at least one NA.
na.cols <- which(colSums(is.na(airquality)) > 0)
# NA count for each affected column, largest first. is.na() is applied to the
# data-frame subset directly so the result is always a matrix, which keeps
# colSums() working even when no column has any NA.
sort(colSums(is.na(airquality[na.cols])), decreasing = TRUE)
##   Ozone Solar.R 
##      37       7
# Report how many columns contain at least one missing value
paste("Number of columns with missing values:", length(na.cols))
## [1] "Number of columns with missing values: 2"
# Row and column index of every missing cell
which(x = is.na(airquality), arr.ind = TRUE)
##       row col
##  [1,]   5   1
##  [2,]  10   1
##  [3,]  25   1
##  [4,]  26   1
##  [5,]  27   1
##  [6,]  32   1
##  [7,]  33   1
##  [8,]  34   1
##  [9,]  35   1
## [10,]  36   1
## [11,]  37   1
## [12,]  39   1
## [13,]  42   1
## [14,]  43   1
## [15,]  45   1
## [16,]  46   1
## [17,]  52   1
## [18,]  53   1
## [19,]  54   1
## [20,]  55   1
## [21,]  56   1
## [22,]  57   1
## [23,]  58   1
## [24,]  59   1
## [25,]  60   1
## [26,]  61   1
## [27,]  65   1
## [28,]  72   1
## [29,]  75   1
## [30,]  83   1
## [31,]  84   1
## [32,] 102   1
## [33,] 103   1
## [34,] 107   1
## [35,] 115   1
## [36,] 119   1
## [37,] 150   1
## [38,]   5   2
## [39,]   6   2
## [40,]  11   2
## [41,]  27   2
## [42,]  96   2
## [43,]  97   2
## [44,]  98   2
# What can we do with these N/A? Is it necessary for analysis?

Graphics

# Histogram of temperature in 20 bins, with Month mapped to the bar outline
# colour. Built with ggplot() + geom_histogram() because qplot() is deprecated
# in current ggplot2 releases.
p1 <- ggplot(data = airquality, mapping = aes(x = Temp, colour = Month)) +
  geom_histogram(bins = 20)
p1

# Overlaid, semi-transparent temperature histograms (5-degree bins), one fill
# colour per month, each bar outlined in black
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    colour = "black"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p2

# Box plot of daily temperature for each month, one coloured box per month.
# The x axis carries the month and the y axis the temperature, so the axis
# titles name those quantities (a box plot has no frequency axis).
p3 <- airquality %>%
  ggplot(aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  labs(
    title = "Temperatures",
    x = "Month",
    y = "Temperature (degrees F)"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p3

# Greyscale version of the monthly temperature box plot.
# The x axis carries the month and the y axis the temperature, so the axis
# titles name those quantities (a box plot has no frequency axis).
p4 <- airquality %>%
  ggplot(aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  labs(
    title = "Monthly Temperature Variations",
    x = "Month",
    y = "Temperature (degrees F)"
  ) +
  scale_fill_grey(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p4

Challenge Myself with Different Plots

## Plot 5 - My own plot

#test of color ramp
#n <- nrow(airquality)
#ramp <- colorRampPalette(c("green","blue","yellow"))(n)

# rbokeh scatter plot with Temp on x and Wind on y; both the point colour and
# the glyph vary with Month
p5 <- ly_points(
  figure(),
  Temp,
  Wind,
  data = airquality,
  color = Month,
  glyph = Month
)
p5
#recreate the same rbokeh graph into a ggplot
# Transparency is a fixed setting, so it is passed to geom_point() rather than
# mapped inside aes(); mapping a constant there adds a stray "alpha" legend
# entry and does not set the literal transparency value.
p6 <- airquality %>%
  ggplot(aes(x = Temp, y = Wind, shape = Month, color = Month)) +
  geom_point(size = 4, alpha = 0.5) +
  scale_shape_manual(values = c(16, 15, 17, 18, 10)) +
  scale_color_manual(values = c("blue", "orange", "green", "red", "purple"))
p6

Discussion on my Graph

“Be sure to write a brief essay that describes the plot you have created, what the plot shows, and what code you used to make this modification.”

I created a simple graph using the rbokeh package. With that package, I set the color and shape of each point based on the month. After that was complete, I used a more commonly used package, ggplot2, to re-create the same rbokeh plot. I had to manually set the shapes and colors to match the rbokeh plot as closely as possible.