“I have neither given nor received unauthorized assistance on this assignment.”
Question 1 a) iris is a data set that contains 150 observations of 5 variables. The variable names are Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, and Species. The first four variables contain measurements of the sepal and petal dimensions of each flower, and Species records which species of iris the flower belongs to.
# NOTE(review): rm(list = ls()) removes only the objects that ls() lists for
# the current environment. It does not detach packages or reset options, so it
# is not a full session reset; consider dropping it and running the document
# in a fresh R session instead.
rm(list=ls())
# Attach dplyr; the attach/masking messages that follow come from this call.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# A) Attach the datasets package and load the iris data frame.
library(datasets)
data(iris)

# B) Build a 3 x 8 summary matrix: one row per species, the means of the four
# measurement columns in columns 1-4 and their standard deviations in 5-8.

# One data frame per species (kept as separate objects, as before).
iris.setosa <- iris[iris$Species == "setosa", ]
iris.versicolor <- iris[iris$Species == "versicolor", ]
iris.virginica <- iris[iris$Species == "virginica", ]

# For each species compute c(means, sds) over columns 1-4. vapply() returns
# the per-species vectors as columns, so transpose to get species in rows.
summary.matrix <- t(vapply(
  list(iris.setosa, iris.versicolor, iris.virginica),
  function(flowers) {
    measurements <- flowers[, 1:4]
    c(
      vapply(measurements, mean, numeric(1), na.rm = TRUE),
      vapply(measurements, sd, numeric(1), na.rm = TRUE)
    )
  },
  numeric(8)
))

# Label rows by species and columns by statistic.
dimnames(summary.matrix) <- list(
  c("Setosa", "Versicolor", "Virginica"),
  c(
    "Mean Sepal Length", "Mean Sepal Width",
    "Mean Petal Length", "Mean Petal Width",
    "Sepal Length S.D", "Sepal Width S.D",
    "Petal Length S.D", "Petal Width S.D"
  )
)
#C)
#i)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.6.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
library(lattice)
# Sepal width against sepal length, points coloured by species.
p1 <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point()
#ii)
# Petal width against petal length, points coloured by species.
p2 <- ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)
) +
  geom_point()
# Draw both scatterplots together in a single figure.
grid.arrange(p1, p2)
1 C)
Overall, there is a positive correlation between length and width for both petals and sepals. The correlation between petal length and width is much stronger than that between sepal length and width. Setosa shows the widest sepals overall and virginica shows the longest. The sepal dimensions of virginica and versicolor overlap. In increasing order of petal length and width, the species are setosa, versicolor, and virginica.
Question 2
```r
library(dplyr)

# Per-county crash summary: number of crashes plus mean/max biker and driver
# age, restricted to the 14 counties with the most crashes, with a population
# column appended.
# NOTE(review): the input path is an absolute path on one machine; the script
# only runs where that file exists.
nc_bike_crash <- read.csv("C:/Users/laith/Desktop/CMDA/nc_bike_crash.csv", sep=";")
nc_bike_crash <- select(nc_bike_crash, Bike_Age, County, Drvr_Age)

# Largest value of x ignoring NAs. Returns NA when x has no non-missing values,
# instead of max()'s -Inf plus a warning.
max_or_na <- function(x) {
  if (all(is.na(x))) {
    return(NA_real_)
  }
  max(x, na.rm = TRUE)
}

# Distinct county names in order of first appearance; a missing county cannot
# be summarised, so it is dropped.
county.names <- unique(as.character(nc_bike_crash$County))
county.names <- county.names[!is.na(county.names)]

# Size the matrix from the data rather than assuming exactly 100 counties.
summary.matrix <- matrix(
  NA_real_,
  nrow = length(county.names),
  ncol = 5,
  dimnames = list(
    county.names,
    c("No. Crashes", "Mean Biker Age", "Max. Biker Age",
      "Mean Driver Age", "Max Driver Age")
  )
)

# seq_along() keeps the loop from running when there are no counties.
for (i in seq_along(county.names)) {
  crash_subset <- subset(nc_bike_crash, County == county.names[i])
  summary.matrix[i, "No. Crashes"] <- nrow(crash_subset)
  summary.matrix[i, "Mean Biker Age"] <- mean(crash_subset$Bike_Age, na.rm = TRUE)
  summary.matrix[i, "Max. Biker Age"] <- max_or_na(crash_subset$Bike_Age)
  summary.matrix[i, "Mean Driver Age"] <- mean(crash_subset$Drvr_Age, na.rm = TRUE)
  summary.matrix[i, "Max Driver Age"] <- max_or_na(crash_subset$Drvr_Age)
}

# Sort by crash count, largest first, and keep at most the top 14 counties.
# drop = FALSE and head() keep the result a matrix even with few counties.
summary.matrix <- summary.matrix[
  order(-summary.matrix[, "No. Crashes"]), ,
  drop = FALSE
]
summary.matrix <- head(summary.matrix, 14)

# Populations keyed by county name, so each figure is attached to its own
# county regardless of sort order; a county missing from this table gets NA.
county.population <- c(
  "Wake" = 1072000,
  "Mecklenburg" = 804006,
  "Guilford" = 526593,
  "New Hanover" = 227198,
  "Durham" = 311640,
  "Cumberland" = 332546,
  "Buncombe" = 257607,
  "Orange" = 144946,
  "Forsyth" = 376320,
  "Gaston" = 220182,
  "Pitt" = 179042,
  "Robeson" = 132606,
  "Onslow" = 193893,
  "Nash" = 93991
)
summary.matrix <- cbind(
  summary.matrix,
  Population = unname(county.population[rownames(summary.matrix)])
)
summary.matrix
## No. Crashes Mean Biker Age Max. Biker Age Mean Driver Age
## Wake 757 29.30458 69 37.77033
## Mecklenburg 695 31.81132 69 39.22184
## Guilford 384 30.09309 69 37.00649
## New Hanover 373 31.94693 69 39.01987
## Durham 253 29.63855 63 40.28000
## Cumberland 221 31.89573 67 39.29213
## Buncombe 157 29.25490 69 39.54237
## Orange 139 31.21324 65 38.95575
## Forsyth 123 32.64463 69 39.97917
## Gaston 113 27.50459 69 38.05682
## Pitt 109 29.86916 65 38.70213
## Robeson 105 36.48571 65 40.79518
## Onslow 101 30.30000 64 40.40244
## Nash 100 31.96907 68 37.42308
## Max Driver Age Population
## Wake 70 1072000
## Mecklenburg 70 804006
## Guilford 70 526593
## New Hanover 70 227198
## Durham 70 311640
## Cumberland 70 332546
## Buncombe 70 257607
## Orange 70 144946
## Forsyth 70 376320
## Gaston 70 220182
## Pitt 70 179042
## Robeson 70 132606
## Onslow 70 193893
## Nash 70 93991
Question 3
library(ggplot2)

# Restore the saved workspace objects; this presumably provides `loc`, which
# is first referenced on the next line.
load("C:/Users/laith/Desktop/CMDA/ncbikeday1.RData")

# Keep only rows with a positive latitude, then set up the base plot of
# longitude (x) against latitude (y).
loc <- subset(loc, lat > 0)
p <- ggplot(data = loc, mapping = aes(x = long, y = lat))

# The ten largest cities in NC, per
# http://worldpopulationreview.com/states/north-carolina-population/cities/
city.names <- c(
  "Charlotte", "Raleigh", "Greensboro", "Durham", "Winston-Salem",
  "Fayetteville", "Cary", "Wilmington", "High Point", "Greenville"
)
Latitude <- c(
  35.227, 35.772, 36.073, 35.994, 36.1,
  35.053, 35.792, 34.226, 35.956, 35.613
)
Longitude <- c(
  -80.843, -78.639, -79.792, -78.899, -80.244,
  -78.878, -78.781, -77.945, -80.005, -77.366
)
top10 <- data.frame(CityName = city.names, lat = Latitude, long = Longitude)

# Crash locations first, then the cities on top. Mapping colour to the
# constant "Major City" gives the city layer its own legend entry.
p +
  geom_point() +
  geom_point(
    data = top10,
    mapping = aes(x = long, y = lat, color = "Major City")
  )
Question 4
library(ggplot2)
library(gridExtra)
library(grid)
library(lattice)

# Re-read the full crash table (semicolon-separated).
nc_bike_crash <- read.csv("C:/Users/laith/Desktop/CMDA/nc_bike_crash.csv", sep=";")

# Biker age by biker injury category.
p1 <- ggplot(
  data = nc_bike_crash,
  mapping = aes(x = Bike_Injur, y = Bike_Age)
) +
  geom_boxplot()

# Driver age by driver injury category.
p2 <- ggplot(
  data = nc_bike_crash,
  mapping = aes(x = Drvr_Injur, y = Drvr_Age)
) +
  geom_boxplot()

# Draw both boxplots together in a single figure.
grid.arrange(p1, p2)
## Warning: Removed 130 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1096 rows containing non-finite values (stat_boxplot).
Question 6
library(gcookbook)
data(uspopage)
library(ggplot2)
library(gridExtra)
library(grid)
library(lattice)

# Stacked area chart of population (in thousands) by year, filled by age group.
p1 <- ggplot(
  data = uspopage,
  mapping = aes(x = Year, y = Thousands, fill = AgeGroup)
) +
  geom_area()

# Same data with position = "fill", so each year's areas are scaled to
# proportions; the y axis is labelled as percentages.
p2 <- ggplot(
  data = uspopage,
  mapping = aes(x = Year, y = Thousands, fill = AgeGroup)
) +
  geom_area(position = "fill", colour = "black", size = .2, alpha = .4) +
  scale_y_continuous(labels = scales::percent)

# Draw both charts together in a single figure.
grid.arrange(p1, p2)
uspopage is a dataset that shows the U.S. Census estimates of the population of the United States between 1900 and 2002. The dataset has three variables: Year, AgeGroup, and Thousands (the population in thousands). There are 824 observations in the dataset.
Question 7
# Attach the datasets package and load the faithful data frame.
library(datasets)
data(faithful)
# Scatterplot with the eruptions column on the x axis and the waiting column
# on the y axis. No xlab/ylab is supplied, so the axis labels are left to
# plot()'s defaults.
plot(x = faithful$eruptions, y = faithful$waiting)
The faithful dataset stores the waiting time between eruptions and the duration of the eruptions for the Old Faithful geyser in Yellowstone National Park. It has only two variables, eruption time and waiting time, and 272 observations of each variable. When plotting the eruptions on the x axis and the waiting time on the y axis, it is apparent that longer eruptions are associated with longer waiting times and vice versa. Additionally, there are very few data points with eruptions between 2.3 and 3.5 compared to the eruptions between 1.5 and 2.5 and between 3.5 and 5.