“I have neither given nor received unauthorized assistance on this assignment.”

Question 1 a) iris is a data set that contains 150 observations of 5 variables. The variable names are Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, and Species. The first four variables contain measurements of the sepal and petal dimensions of flowers from various species of iris, and Species identifies the species of each flower.

rm(list=ls())
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#A)
library(datasets)
data(iris)
#B)
# Build a 3 x 8 summary of the four numeric iris columns: one row per species,
# columns 1-4 hold the per-species means and columns 5-8 the per-species
# standard deviations, both in the column order of `iris`.
iris.setosa <- subset(iris, Species == "setosa")
iris.versicolor <- subset(iris, Species == "versicolor")
iris.virginica <- subset(iris, Species == "virginica")

# The row order of the summary follows the order of this list.
species_frames <- list(iris.setosa, iris.versicolor, iris.virginica)

summary.matrix <- matrix(data = NA_real_, nrow = 3, ncol = 8)
for (row in seq_along(species_frames)) {
  # Columns 1-4 are the measurements; column 5 (Species) is excluded.
  measurements <- species_frames[[row]][, 1:4]
  summary.matrix[row, 1:4] <- vapply(measurements, mean, numeric(1),
                                     na.rm = TRUE)
  summary.matrix[row, 5:8] <- vapply(measurements, sd, numeric(1),
                                     na.rm = TRUE)
}

dimnames(summary.matrix) <- list(
  c("Setosa", "Versicolor", "Virginica"),
  c(
    "Mean Sepal Length", "Mean Sepal Width",
    "Mean Petal Length", "Mean Petal Width",
    "Sepal Length S.D", "Sepal Width S.D",
    "Petal Length S.D", "Petal Width S.D"
  )
)

#C)
#i)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.2
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(grid)
library(lattice)
# Sepal length against sepal width, one colour per species.
p1 <- ggplot(
  data = iris,
  mapping = aes(Sepal.Length, Sepal.Width, colour = Species)
) +
  geom_point()

#ii)
# Petal length against petal width, one colour per species.
p2 <- ggplot(
  data = iris,
  mapping = aes(Petal.Length, Petal.Width, colour = Species)
) +
  geom_point()

# Draw both scatterplots together in a single figure.
grid.arrange(p1, p2)

1 C)
Overall, there is a positive correlation between petal/sepal length and petal/sepal width. The correlation between petal length and width is much stronger than that between sepal length and width. Setosa shows the widest sepals overall, and virginica shows the longest. The sepal measurements of virginica and versicolor overlap. In increasing order of petal length and width, the species are setosa, versicolor, and virginica.

Question 2

```r
library(dplyr)

# Per-county summary of the NC bike crash data: number of crashes plus mean
# and maximum age of the bikers and drivers involved, restricted to the 14
# counties with the most crashes and extended with a population column.

# NOTE(review): machine-specific absolute path, kept as in the original
# analysis. The file is read as semicolon-delimited.
nc_bike_crash <- read.csv(
  "C:/Users/laith/Desktop/CMDA/nc_bike_crash.csv",
  sep = ";"
)
nc_bike_crash <- select(nc_bike_crash, Bike_Age, County, Drvr_Age)

# max() with na.rm = TRUE warns and returns -Inf when no non-missing value is
# left (a county where every age is NA); report NA for that case instead.
max_or_na <- function(x) {
  if (all(is.na(x))) NA_real_ else max(x, na.rm = TRUE)
}

county.names <- unique(nc_bike_crash$County)

# Size the matrix from the data rather than assuming exactly 100 counties.
summary.matrix <- matrix(
  NA_real_,
  nrow = length(county.names),
  ncol = 5,
  dimnames = list(
    as.character(county.names),
    c("No. Crashes", "Mean Biker Age", "Max. Biker Age",
      "Mean Driver Age", "Max Driver Age")
  )
)

for (i in seq_along(county.names)) {
  crash_subset <- subset(nc_bike_crash, County == county.names[i])
  summary.matrix[i, 1] <- nrow(crash_subset)
  summary.matrix[i, 2] <- mean(crash_subset$Bike_Age, na.rm = TRUE)
  summary.matrix[i, 3] <- max_or_na(crash_subset$Bike_Age)
  summary.matrix[i, 4] <- mean(crash_subset$Drvr_Age, na.rm = TRUE)
  summary.matrix[i, 5] <- max_or_na(crash_subset$Drvr_Age)
}

# Rank counties by number of crashes (descending) and keep at most the top 14.
crash_rank <- order(-summary.matrix[, 1])
summary.matrix <- head(summary.matrix[crash_rank, , drop = FALSE], 14)

# Populations are looked up by county name, so each value stays attached to
# the right county even if the ranking changes. A county that is missing from
# this table gets NA. The figures are the ones used in the original analysis.
county_population <- c(
  "Wake" = 1072000, "Mecklenburg" = 804006, "Guilford" = 526593,
  "New Hanover" = 227198, "Durham" = 311640, "Cumberland" = 332546,
  "Buncombe" = 257607, "Orange" = 144946, "Forsyth" = 376320,
  "Gaston" = 220182, "Pitt" = 179042, "Robeson" = 132606,
  "Onslow" = 193893, "Nash" = 93991
)
summary.matrix <- cbind(
  summary.matrix,
  Population = unname(county_population[rownames(summary.matrix)])
)

summary.matrix
##             No. Crashes Mean Biker Age Max. Biker Age Mean Driver Age
## Wake                757       29.30458             69        37.77033
## Mecklenburg         695       31.81132             69        39.22184
## Guilford            384       30.09309             69        37.00649
## New Hanover         373       31.94693             69        39.01987
## Durham              253       29.63855             63        40.28000
## Cumberland          221       31.89573             67        39.29213
## Buncombe            157       29.25490             69        39.54237
## Orange              139       31.21324             65        38.95575
## Forsyth             123       32.64463             69        39.97917
## Gaston              113       27.50459             69        38.05682
## Pitt                109       29.86916             65        38.70213
## Robeson             105       36.48571             65        40.79518
## Onslow              101       30.30000             64        40.40244
## Nash                100       31.96907             68        37.42308
##             Max Driver Age Population
## Wake                    70    1072000
## Mecklenburg             70     804006
## Guilford                70     526593
## New Hanover             70     227198
## Durham                  70     311640
## Cumberland              70     332546
## Buncombe                70     257607
## Orange                  70     144946
## Forsyth                 70     376320
## Gaston                  70     220182
## Pitt                    70     179042
## Robeson                 70     132606
## Onslow                  70     193893
## Nash                    70      93991

Question 3

library(ggplot2)
# NOTE(review): `loc` is not created anywhere in this script; presumably it is
# restored by this load() call -- confirm against the RData file.
load("C:/Users/laith/Desktop/CMDA/ncbikeday1.RData")

# Keep only the rows whose latitude is greater than zero.
loc <- subset(loc, lat > 0)
p <- ggplot(data = loc, mapping = aes(long, lat))

#Top 10 largest cities in NC according to http://worldpopulationreview.com/states/north-carolina-population/cities/
# The three vectors below are parallel: position k in each describes city k.
city.names <- c(
  "Charlotte", "Raleigh", "Greensboro", "Durham", "Winston-Salem",
  "Fayetteville", "Cary", "Wilmington", "High Point", "Greenville"
)
Latitude <- c(
  35.227, 35.772, 36.073, 35.994, 36.1,
  35.053, 35.792, 34.226, 35.956, 35.613
)
Longitude <- c(
  -80.843, -78.639, -79.792, -78.899, -80.244,
  -78.878, -78.781, -77.945, -80.005, -77.366
)
top10 <- data.frame(CityName = city.names, lat = Latitude, long = Longitude)

# First layer: the points in `loc`. Second layer: the ten cities; mapping
# colour to the constant "Major City" inside aes() produces a legend entry.
p +
  geom_point() +
  geom_point(
    data = top10,
    mapping = aes(x = long, y = lat, color = "Major City")
  )

Question 4

library(ggplot2)
library(gridExtra)
library(grid)
library(lattice)
# Re-read the full crash data (semicolon-delimited) for the injury boxplots.
nc_bike_crash <- read.csv(
  "C:/Users/laith/Desktop/CMDA/nc_bike_crash.csv",
  sep = ";"
)

# Biker age broken down by biker injury category.
p1 <- ggplot(data = nc_bike_crash, mapping = aes(Bike_Injur, Bike_Age)) +
  geom_boxplot()

# Driver age broken down by driver injury category.
p2 <- ggplot(data = nc_bike_crash, mapping = aes(Drvr_Injur, Drvr_Age)) +
  geom_boxplot()

# Draw both boxplot panels together in a single figure.
grid.arrange(p1, p2)
## Warning: Removed 130 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1096 rows containing non-finite values (stat_boxplot).

Question 6

library(gcookbook)
data(uspopage)
library(ggplot2)
library(gridExtra)
library(grid)
library(lattice)

# Stacked area chart of population (in thousands) by age group over time.
p1 <- ggplot(
  data = uspopage,
  mapping = aes(Year, Thousands, fill = AgeGroup)
) +
  geom_area()

# Same data with position = "fill", so each year's areas are rescaled to
# proportions; the y axis is labelled as percentages.
p2 <- ggplot(
  data = uspopage,
  mapping = aes(Year, Thousands, fill = AgeGroup)
) +
  geom_area(position = "fill", colour = "black", size = 0.2, alpha = 0.4) +
  scale_y_continuous(labels = scales::percent)

# Draw both area charts together in a single figure.
grid.arrange(p1, p2)

uspopage is a dataset that shows the U.S. Census estimates of the population of the United States between 1900 and 2002. The dataset has three variables: year, age group, and population in thousands. There are 834 observations in the dataset.

Question 7

library(datasets)
data(faithful)

# Scatterplot of eruption duration against the waiting time to the next
# eruption. Explicit labels replace the default axis titles, which would
# otherwise be the raw expressions `faithful$eruptions` / `faithful$waiting`.
# Both variables are recorded in minutes (see the `faithful` help page).
plot(
  x = faithful$eruptions,
  y = faithful$waiting,
  xlab = "Eruption duration (min)",
  ylab = "Waiting time to next eruption (min)",
  main = "Old Faithful geyser eruptions"
)

The faithful dataset stores the waiting time between eruptions and the duration of the eruptions for the Old Faithful geyser in Yellowstone National Park. It has only two variables, eruption time and waiting time, and 272 observations for each variable. When plotting the eruptions on the x axis and the waiting time on the y axis, it is apparent that longer eruptions require longer waiting times and vice versa. Additionally, there are very few data points with eruptions between 2.3 and 3.5 compared to the eruptions between 1.5 and 2.5 and between 3.5 and 5.