# install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Bring the built-in airquality data set into the workspace under its own name.
airquality <- airquality
# Other ways to refer to the same data:
# airquality
# datasets::airquality
str(airquality)
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# A bare column name is not visible outside the data frame:
# mean(Temp) ## Error in mean(Temp) : object 'Temp' not found
mean(airquality[["Temp"]]) # column selected by name
## [1] 77.88235
mean(airquality[[4]]) # column selected by position
## [1] 77.88235
# Median of Temp, then standard deviation and variance of Wind
with(airquality, median(Temp))
## [1] 79
with(airquality, sd(Wind))
## [1] 3.523001
with(airquality, var(Wind))
## [1] 12.41154
# Change the Months from 5 - 9 to May through September.
# month.name is R's built-in vector of English month names, so indexing it
# with the month number translates the whole column in one vectorized step
# and leaves Month as a character vector.
# The is.numeric() guard keeps this step safe to re-run: once Month is a
# factor, indexing month.name with it would use the factor's integer codes
# and silently produce the wrong names.
if (is.numeric(airquality$Month)) {
  airquality$Month <- month.name[airquality$Month]
}
str(airquality) # check the previous changes
## 'data.frame': 153 obs. of 6 variables:
## $ Ozone : int 41 36 12 18 NA 28 23 19 8 NA ...
## $ Solar.R: int 190 118 149 313 NA NA 299 99 19 194 ...
## $ Wind : num 7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
## $ Temp : int 67 72 74 62 56 66 65 59 61 69 ...
## $ Month : chr "May" "May" "May" "May" ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
# Look at the summary statistics of the dataset and pay attention to "Month":
# it is now summarised as a character column instead of a numeric one.
summary(airquality)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00
## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00
## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00
## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00
## NA's :37 NA's :7
## Month Day
## Length:153 Min. : 1.0
## Class :character 1st Qu.: 8.0
## Mode :character Median :16.0
## Mean :15.8
## 3rd Qu.:23.0
## Max. :31.0
##
# Store Month as a factor with levels in calendar order (May ... September),
# so plots group the months chronologically rather than alphabetically.
airquality$Month <- factor(airquality$Month, levels = month.name[5:9])
This is Plot 1
There are multiple chunks in Plot 1, from Plot 1-1 to Plot 1-6
# Plot 1-1 (copy from the instructor): histogram of Temp, filled by Month, drawn with qplot
# qplot stands for "Quick-Plot" (in the ggplot2 package)
# The same call without storing the result:
# qplot(data = airquality, Temp, fill = Month, geom = "histogram", bins = 20)
p1 <- qplot(
  x = Temp,
  data = airquality,
  geom = "histogram",
  fill = Month,
  bins = 20
)
p1

# print(p1) ## in Python
# Plot 1-2 (my study 1): simple histogram with all defaults
# Keep the temperature column in its own vector; hist() records the name of
# the variable it was given (see $xname in Plot 1-4).
Temperature <- airquality[["Temp"]]
hist(Temperature)

# Plot 1-3 (my study 2): histogram with a title, axis label, x range and colour
hist(
  Temperature,
  freq = TRUE, # can be replaced by FALSE
  col = "yellow", # can be replaced by other colors, such as darkmagenta
  xlim = c(40, 120), # can be replaced by another range, e.g. c(50, 100)
  xlab = "Temperature in degrees Fahrenheit",
  main = "Maximum daily temperature at La Guardia Airport"
)

# Plot 1-4 (my study 3): hist() draws the plot and also returns an object of
# class "histogram"; the printout below shows its 6 components.
h <- hist(Temperature)

# Print the returned object: bin edges ($breaks), per-bin counts ($counts),
# densities ($density), bin midpoints ($mids), the name of the plotted
# variable ($xname) and whether the bins are equally wide ($equidist).
h
## $breaks
## [1] 55 60 65 70 75 80 85 90 95 100
##
## $counts
## [1] 8 10 15 19 33 34 20 12 2
##
## $density
## [1] 0.010457516 0.013071895 0.019607843 0.024836601 0.043137255 0.044444444
## [7] 0.026143791 0.015686275 0.002614379
##
## $mids
## [1] 57.5 62.5 67.5 72.5 77.5 82.5 87.5 92.5 97.5
##
## $xname
## [1] "Temperature"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Plot 1-5 (my study 4): label each bar with its count, using the values hist() returns
# ylim is raised to 50 so the labels drawn above the bars stay inside the plot.
h <- hist(Temperature, ylim = c(0, 50))
# One label per bin: x at the bin midpoint, y at the bar height; adj centres
# the text horizontally and shifts it above that point.
text(x = h$mids, y = h$counts, labels = h$counts, adj = c(0.5, -0.5))

# Plot 1-6 (my study 5): histograms with different numbers of breaks
# Each title states the breaks value actually passed to hist(). A single
# number for `breaks` is only a suggestion; hist() picks "pretty" cut points
# near it, so the number of bars drawn can differ.
hist(Temperature, breaks = 4, main = "With breaks=4")

hist(Temperature, breaks = 20, main = "With breaks=20")

This is Plot 2
There are multiple chunks in Plot 2, from Plot 2-1 to Plot 2-2
# Plot 2-1 (copy from the instructor): overlaid histograms of Temp per Month with ggplot
# position = "identity" draws the monthly histograms on top of each other
# instead of stacking them; alpha = 0.5 keeps the overlapping bars visible.
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    binwidth = 5,
    position = "identity",
    alpha = 0.5,
    color = "white"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p2

# Plot 2-2 (my study 1): the same ggplot histogram with some parameter values changed
# (binwidth 1 instead of 5, black bar outlines instead of white).
# Note that this overwrites the p2 object from Plot 2-1.
p2 <- ggplot(data = airquality, mapping = aes(x = Temp, fill = Month)) +
  geom_histogram(
    binwidth = 1,
    position = "identity",
    alpha = 0.5,
    color = "black"
  ) +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p2

This is Plot 3
There are multiple chunks in Plot 3, from Plot 3-1 to Plot 3-4
# Plot 3-1 (copy from the instructor): side-by-side boxplots of Temp, one box per Month
# The axis labels name what each axis shows: Month on x and temperature on y.
# (A boxplot has no frequency axis, so the y-axis is not labelled "Frequency".)
p3 <- airquality %>%
  ggplot(aes(Month, Temp, fill = Month)) +
  ggtitle("Temperatures") +
  xlab("Month") +
  ylab("Temperature (degrees Fahrenheit)") +
  geom_boxplot() +
  scale_fill_discrete(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p3

# Plot 3-2 (my study 1): boxplot from a formula (Temp split by Month)
boxplot(
  Temp ~ Month,
  data = airquality,
  col = "red",
  border = "black",
  xlab = "Month Name",
  ylab = "Degree Fahrenheit",
  main = "Different boxplots for each month"
)

# Plot 3-3 (my study 2): Make a boxplot for the ozone readings
boxplot(airquality$Ozone)

# Make boxplots for multiple readings side by side.
# Only the numeric readings are plotted: Month is a category, and boxplot()
# would otherwise draw a box of its internal integer codes. Passing the
# columns as a data frame also labels each box with its column name.
boxplot(airquality[c("Ozone", "Wind", "Temp")])

# Plot 3-4 (my study 3): boxplot() draws the plot and also returns a list;
# the printout below shows its 6 components.
b <- boxplot(airquality[["Ozone"]])

# $stats holds the five values drawn for the box (whisker ends, hinges and
# median), $n the number of non-missing observations, $out the points drawn
# as outliers.
b
## $stats
## [,1]
## [1,] 1.0
## [2,] 18.0
## [3,] 31.5
## [4,] 63.5
## [5,] 122.0
##
## $n
## [1] 116
##
## $conf
## [,1]
## [1,] 24.82518
## [2,] 38.17482
##
## $out
## [1] 135 168
##
## $group
## [1] 1 1
##
## $names
## [1] "1"
This is Plot 4
There are multiple chunks in Plot 4, from Plot 4-1 to Plot 4-3
# Plot 4-1 (copy from the instructor): the same side-by-side boxplots, but in grey-scale
# The axis labels name what each axis shows: Month on x and temperature on y.
# (A boxplot has no frequency axis, so the y-axis is not labelled "Frequency".)
p4 <- airquality %>%
  ggplot(aes(Month, Temp, fill = Month)) +
  ggtitle("Monthly Temperature Variations") +
  xlab("Month") +
  ylab("Temperature (degrees Fahrenheit)") +
  geom_boxplot() +
  scale_fill_grey(
    name = "Month",
    labels = c("May", "June", "July", "August", "September")
  )
p4

# Plot 4-2 (my study 1): side-by-side boxplots of Temp for each Month (base graphics)
boxplot(
  Temp ~ Month,
  data = airquality,
  main = "Temp by Month",
  xlab = "Month",
  ylab = "Temp",
  col = "steelblue"
)

# Make side-by-side boxplots with horizontal = TRUE
# With horizontal boxes the temperature values run along the x-axis and the
# months along the y-axis, so the axis labels are swapped relative to the
# vertical version of this plot.
boxplot(airquality$Temp ~ airquality$Month,
  col = "green",
  main = "Temp by Month",
  xlab = "Temp",
  ylab = "Month",
  horizontal = TRUE
)

# Plot 4-3 (my study 2): vertical side-by-side boxplots in ggplot2
# y is mapped to the Temp column of the data frame passed to ggplot(), not to
# a separate global vector, so the plot always uses the rows of `airquality`
# and works even if no `Temperature` object exists. ylab() keeps the
# "Temperature" axis title.
ggplot(airquality, aes(x = Month, y = Temp, fill = Month)) +
  geom_boxplot() +
  ylab("Temperature") +
  ggtitle("Temperature by Month")

This is Plot 5
There are multiple chunks in Plot 5, from Plot 5-1 to Plot 5-2
# Plot 5-1: Now make one plot on your own of any of the variables in this dataset.
# It may be a scatterplot, histogram, or boxplot.
# Basic scatterplot of Ozone against Temp.
# xlim/ylim zoom in on Temp 55-70 and Ozone 5-50, so points outside that
# window are not shown.
with(
  airquality,
  plot(
    x = Temp, y = Ozone,
    main = "Temp vs Ozone",
    xlim = c(55, 70),
    ylim = c(5, 50),
    xlab = "Temp",
    ylab = "Ozone"
  )
)

# Plot 5-2: scatterplot matrix, one scatterplot for each pair of the listed variables
pairs(
  ~ Month + Ozone + Wind + Temp,
  main = "Scatterplot Matrix",
  data = airquality
)
