### Jack Gonzalez R Assignment 1
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Load the vehicle testing data. The compact col_types string supplies one
# type code per column; the four categorical columns are then converted to
# factors in a single pass.
vehicles <- read_csv(
  "https://s3.amazonaws.com/itao-30230/vehicles.csv",
  col_types = "inincicccici"
) %>%
  mutate(across(c(class, drive, make, transmissiontype), as.factor))
###Problem 1
###In this part of the assignment, you will generate five graphs from this dataset meeting the
###provided requirements. You should be able to build these visualizations without using the dplyr library.
###Part A
###Generate a scatterplot showing the miles per gallon that the vehicle experiences
###during city driving on the x-axis and the vehicle’s CO2 emissions on the y-axis.
# Scatterplot: city MPG on the x-axis, CO2 emissions on the y-axis.
ggplot(vehicles, aes(x = citympg, y = co2emissions)) +
  geom_point() +
  labs(
    title = "City MPG vs. Emissions",
    subtitle = "Emissions decreased with better MPG"
  )

###Part B
###Create a second scatterplot showing the same information as the plot from Part A but use color to distinguish
###vehicles by drive type.
# Same relationship as Part A, with points jittered and coloured by drive type.
ggplot(vehicles, aes(x = citympg, y = co2emissions, color = drive)) +
  geom_jitter() +
  labs(
    title = "City MPG vs. Emissions",
    subtitle = "Emissions decreased with better MPG"
  )

###Part C
###Create a stacked bar chart that shows the number of vehicles tested each year broken out by vehicle class.
# Stacked bar chart: number of vehicles tested per year, split by class.
# The outline colour is a fixed setting, so it belongs outside aes(); inside
# aes() the string is treated as data, which yields a default-palette outline
# and a spurious legend entry instead of a green border.
ggplot(vehicles, aes(x = year, fill = class)) +
  geom_bar(color = "green") +
  ggtitle("Number of Car Classes by Year")

###Part D
###Create a set of histograms that shows the number of vehicles tested by their mileage per gallon during
###city driving. You should use one call to ggplot that creates separate histograms for each transmission type.
# Histograms of city MPG, one facet row per transmission type.
# citympg is read as an integer column, so binwidth = 1 gives exactly one bar
# per MPG value; the default of 30 bins produces uneven bins over integer data
# and triggers the "Pick better value with `binwidth`" message.
ggplot(vehicles, aes(x = citympg)) +
  geom_histogram(binwidth = 1, fill = "white", color = "black") +
  facet_grid(transmissiontype ~ .)

###Problem 2 - Working with dplyr
###In this part of the assignment, you will expand your work to use the dplyr grammar of data manipulation.
###For each part below, use a single R statement to perform the task. You may do this by joining together
###dplyr verbs and the ggplot commands with %>% and +.
###Part A
###Print a table showing the minimum, maximum, mean, and median city MPG for vehicles tested, broken out
###by vehicle class. All values should be displayed as integers. Use the as.integer() and round()
###functions, as necessary.
# Min / max / mean / median city MPG per vehicle class, shown as integers.
# as.integer() alone truncates toward zero (e.g. 14.9 -> 14), so the mean and
# median are rounded first, as the assignment asks.
# NOTE: the pasted table below was produced with the truncating version;
# re-run this statement to refresh it.
vehicles %>%
  group_by(class) %>%
  summarise(
    Minimum = as.integer(min(citympg)),
    Maximum = as.integer(max(citympg)),
    Mean = as.integer(round(mean(citympg))),
    Median = as.integer(round(median(citympg)))
  )
## # A tibble: 10 × 5
## class Minimum Maximum Mean Median
## <fct> <int> <int> <int> <int>
## 1 Compact Cars 7 52 20 20
## 2 Large Cars 7 57 16 16
## 3 Midsize Cars 7 51 18 18
## 4 Minivan 13 22 16 16
## 5 Pickup 8 30 14 15
## 6 Special Purpose Vehicle 8 31 15 15
## 7 Sport Utility 10 34 16 16
## 8 Subcompact Cars 7 43 19 19
## 9 Two Seaters 6 49 16 16
## 10 Vans 8 23 13 13
###Part B
###Display a line graph showing the change in average city vs. highway MPG over time. Do not round the
###data to integers this time. Show the city MPG as a red line and the highway MPG as a blue line.
# Average city vs. highway MPG per year, drawn as a red and a blue line.
# Strings inside aes() are treated as category labels, not colours, so the
# labels are mapped to the requested colours with scale_color_manual().
vehicles %>%
  group_by(year) %>%
  summarise(
    city = mean(citympg, na.rm = TRUE),
    highway = mean(highwaympg, na.rm = TRUE)
  ) %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = city, color = "City")) +
  geom_line(aes(y = highway, color = "Highway")) +
  scale_color_manual(
    name = "MPG",
    values = c(City = "red", Highway = "blue")
  ) +
  labs(y = "Average MPG")

### Part C
### Modify the graph above to also show the overall MPG, computed as the average of city and
###highway MPG. Plot this as a green line.
# Average city, highway and overall MPG per year (red, blue and green lines).
# Overall MPG is the mean of the yearly city and highway averages.
# Strings inside aes() are treated as category labels, not colours, so the
# labels are mapped to the requested colours with scale_color_manual().
vehicles %>%
  group_by(year) %>%
  summarise(
    city = mean(citympg, na.rm = TRUE),
    highway = mean(highwaympg, na.rm = TRUE),
    overall = (city + highway) / 2
  ) %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = city, color = "City")) +
  geom_line(aes(y = highway, color = "Highway")) +
  geom_line(aes(y = overall, color = "Overall")) +
  scale_color_manual(
    name = "MPG",
    values = c(City = "red", Highway = "blue", Overall = "green")
  ) +
  labs(y = "Average MPG")

###Part D
###Modify the graph above to show separate graphs for each drive type.
# Average city, highway and overall MPG per year, one facet row per drive type.
# .groups = "drop" returns an ungrouped summary and silences the
# "summarise() has grouped output" message.
# Strings inside aes() are treated as category labels, not colours, so the
# labels are mapped to the requested colours with scale_color_manual().
vehicles %>%
  group_by(year, drive) %>%
  summarise(
    city = mean(citympg, na.rm = TRUE),
    highway = mean(highwaympg, na.rm = TRUE),
    overall = (city + highway) / 2,
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year)) +
  geom_line(aes(y = city, color = "City")) +
  geom_line(aes(y = highway, color = "Highway")) +
  geom_line(aes(y = overall, color = "Overall")) +
  scale_color_manual(
    name = "MPG",
    values = c(City = "red", Highway = "blue", Overall = "green")
  ) +
  labs(y = "Average MPG") +
  facet_grid(drive ~ .)

###Problem 3 - Data Exploration and Visualization
###Use the strategies you learned in class as well as your own independent research to explore the vehicle testing and college datasets in more detail.
###Create two interesting visualizations based upon your exploration. Take the time to beautify them.
###Your grade for this portion of the assignment will be based upon your technical ability to create
###the visualizations, their analytical uniqueness, and their appearance. Feel free to use the techniques
###that we explored in class and/or to explore other features of ggplot2.For each of your visualizations,
###provide a brief description of the insight that they provide.You should answer the question
###(with just a sentence or two),“What do we learn from this visualization?”You can provide your answer
###as comments in your code or, if you would like to explore a new technology,try using RMarkdown.
# Exploratory look at the data before plotting: view() shows the table, and
# Hmisc::describe() prints the per-variable summary pasted below. Hmisc is
# called through `::`, so it does not need to be attached.
view(vehicles)
Hmisc::describe(vehicles)
## vehicles
##
## 12 Variables 36979 Observations
## --------------------------------------------------------------------------------
## citympg
## n missing distinct Info Mean Gmd .05 .10
## 36979 0 46 0.994 17.53 4.822 11 12
## .25 .50 .75 .90 .95
## 15 17 20 23 26
##
## lowest : 6 7 8 9 10, highest: 49 51 52 55 57
## --------------------------------------------------------------------------------
## co2emissions
## n missing distinct Info Mean Gmd .05 .10
## 36979 0 569 0.997 476.6 129.3 313.0 341.8
## .25 .50 .75 .90 .95
## 400.0 467.7 555.4 634.8 683.6
##
## lowest : 29.0000 37.0000 40.0000 97.0000 101.0000
## highest: 847.0000 888.7000 987.4444 1110.8750 1269.5714
## --------------------------------------------------------------------------------
## cylinders
## n missing distinct Info Mean Gmd
## 36979 0 9 0.892 5.776 1.85
##
## lowest : 2 3 4 5 6, highest: 6 8 10 12 16
##
## Value 2 3 4 5 6 8 10 12 16
## Frequency 49 232 13719 730 13218 8287 153 582 9
## Proportion 0.001 0.006 0.371 0.020 0.357 0.224 0.004 0.016 0.000
## --------------------------------------------------------------------------------
## displacement
## n missing distinct Info Mean Gmd .05 .10
## 36979 0 65 0.998 3.346 1.531 1.6 1.8
## .25 .50 .75 .90 .95
## 2.2 3.0 4.3 5.4 5.9
##
## lowest : 0.6 0.9 1.0 1.1 1.2, highest: 7.0 7.4 8.0 8.3 8.4
## --------------------------------------------------------------------------------
## drive
## n missing distinct
## 36979 0 5
##
## lowest : 2-Wheel Drive 4-Wheel Drive All-Wheel Drive Front-Wheel Drive Rear-Wheel Drive
## highest: 2-Wheel Drive 4-Wheel Drive All-Wheel Drive Front-Wheel Drive Rear-Wheel Drive
##
## Value 2-Wheel Drive 4-Wheel Drive All-Wheel Drive
## Frequency 491 1349 8871
## Proportion 0.013 0.036 0.240
##
## Value Front-Wheel Drive Rear-Wheel Drive
## Frequency 13074 13194
## Proportion 0.354 0.357
## --------------------------------------------------------------------------------
## highwaympg
## n missing distinct Info Mean Gmd .05 .10
## 36979 0 50 0.997 23.77 6.371 15 17
## .25 .50 .75 .90 .95
## 20 24 27 31 34
##
## lowest : 9 10 11 12 13, highest: 54 58 59 60 61
## --------------------------------------------------------------------------------
## make
## n missing distinct
## 36979 0 128
##
## lowest : Acura Alfa Romeo AM General American Motors Corporation ASC Incorporated
## highest: Volkswagen Volvo VPG Wallace Environmental Yugo
## --------------------------------------------------------------------------------
## model
## n missing distinct
## 36979 0 3650
##
## lowest : 1-Ton Truck 2WD 100 100 quattro 100 quattro Wagon 100 Wagon
## highest: Z4 sDrive35i Z4 sDrive35is Z8 ZDX 4WD Zephyr
## --------------------------------------------------------------------------------
## class
## n missing distinct
## 36979 0 10
##
## lowest : Compact Cars Large Cars Midsize Cars Minivan Pickup
## highest: Special Purpose Vehicle Sport Utility Subcompact Cars Two Seaters Vans
## --------------------------------------------------------------------------------
## year
## n missing distinct Info Mean Gmd .05 .10
## 36979 0 35 0.999 2001 11.99 1985 1987
## .25 .50 .75 .90 .95
## 1991 2001 2010 2015 2017
##
## lowest : 1984 1985 1986 1987 1988, highest: 2014 2015 2016 2017 2018
## --------------------------------------------------------------------------------
## transmissiontype
## n missing distinct
## 36979 0 2
##
## Value Automatic Manual
## Frequency 24910 12069
## Proportion 0.674 0.326
## --------------------------------------------------------------------------------
## transmissionspeeds
## n missing distinct Info Mean Gmd
## 36979 0 9 0.928 4.954 1.315
##
## lowest : 1 3 4 5 6, highest: 6 7 8 9 10
##
## Value 1 3 4 5 6 7 8 9 10
## Frequency 6 2799 12391 11004 7307 1683 1546 204 39
## Proportion 0.000 0.076 0.335 0.298 0.198 0.046 0.042 0.006 0.001
## --------------------------------------------------------------------------------
# Per-column summary of the vehicles data (output pasted below).
vehicles %>% summary()
## citympg co2emissions cylinders displacement
## Min. : 6.00 Min. : 29.0 Min. : 2.000 Min. :0.600
## 1st Qu.:15.00 1st Qu.: 400.0 1st Qu.: 4.000 1st Qu.:2.200
## Median :17.00 Median : 467.7 Median : 6.000 Median :3.000
## Mean :17.53 Mean : 476.6 Mean : 5.776 Mean :3.346
## 3rd Qu.:20.00 3rd Qu.: 555.4 3rd Qu.: 6.000 3rd Qu.:4.300
## Max. :57.00 Max. :1269.6 Max. :16.000 Max. :8.400
##
## drive highwaympg make model
## 2-Wheel Drive : 491 Min. : 9.00 Chevrolet: 3750 Length:36979
## 4-Wheel Drive : 1349 1st Qu.:20.00 Ford : 3044 Class :character
## All-Wheel Drive : 8871 Median :24.00 Dodge : 2461 Mode :character
## Front-Wheel Drive:13074 Mean :23.77 GMC : 2414
## Rear-Wheel Drive :13194 3rd Qu.:27.00 Toyota : 1840
## Max. :61.00 BMW : 1774
## (Other) :21696
## class year transmissiontype
## Compact Cars :7918 Min. :1984 Automatic:24910
## Pickup :5763 1st Qu.:1991 Manual :12069
## Midsize Cars :5226 Median :2001
## Sport Utility :5156 Mean :2001
## Subcompact Cars :4523 3rd Qu.:2010
## Special Purpose Vehicle:2378 Max. :2018
## (Other) :6015
## transmissionspeeds
## Min. : 1.000
## 1st Qu.: 4.000
## Median : 5.000
## Mean : 4.954
## 3rd Qu.: 6.000
## Max. :10.000
##
# Compact structure listing: column types and the first few values.
vehicles %>% str()
## tibble [36,979 × 12] (S3: tbl_df/tbl/data.frame)
## $ citympg : int [1:36979] 22 21 23 23 15 17 19 19 19 21 ...
## $ co2emissions : num [1:36979] 386 404 370 342 523 ...
## $ cylinders : int [1:36979] 4 4 4 4 5 4 4 4 4 4 ...
## $ displacement : num [1:36979] 1.5 1.5 1.5 1.5 2.2 1.8 2 2 2 2 ...
## $ drive : Factor w/ 5 levels "2-Wheel Drive",..: 3 3 3 3 3 4 4 4 4 4 ...
## $ highwaympg : int [1:36979] 24 23 27 29 20 23 26 26 26 26 ...
## $ make : Factor w/ 128 levels "Acura","Alfa Romeo",..: 119 119 46 119 124 17 17 18 20 82 ...
## $ model : chr [1:36979] "Tercel Wagon 4WD" "Tercel Wagon 4WD" "Civic Wagon 4WD" "Tercel Wagon 4WD" ...
## $ class : Factor w/ 10 levels "Compact Cars",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int [1:36979] 1985 1985 1985 1985 1985 1985 1985 1985 1985 1985 ...
## $ transmissiontype : Factor w/ 2 levels "Automatic","Manual": 1 1 2 2 2 1 1 1 1 1 ...
## $ transmissionspeeds: int [1:36979] 3 3 5 5 5 3 3 3 3 3 ...
### This graph shows how bad the city MPG is for the majority of vehicles in this data set.
# Stacked bar chart of vehicle counts per city MPG value, shaded by model year.
# year is numeric, so without an explicit group ggplot cannot split the bars
# and drops the fill aesthetic (the "aesthetics were dropped" warning).
# Grouping by year keeps fill constant within each group, so it is retained.
# The constant outline colour that was mapped inside aes() is omitted: it added
# a spurious legend, and per-segment outlines would hide the year shading.
ggplot(vehicles, aes(x = citympg, fill = year, group = year)) +
  geom_bar() +
  ggtitle("City MPG by Year")

# Jittered scatterplot of highway MPG (x) against CO2 emissions (y), coloured
# by model year. The title now names the two plotted axes; it previously read
# "Highway MPG vs. Year" although year is only the colour scale.
ggplot(vehicles, aes(x = highwaympg, y = co2emissions, color = year)) +
  geom_jitter() +
  ggtitle(
    "Highway MPG vs. CO2 Emissions by Year",
    subtitle = "Emissions decreased with Year"
  )

###This graph depicts how year has a positive correlation with increased Highway MPG
# Load the college data; column types are guessed by read_csv(), which is why
# the column specification message is printed below.
College <- read_csv(file = "https://s3.amazonaws.com/itao-30230/college.csv")
## Rows: 1270 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): name, city, state, region, highest_degree, control, gender, loan_de...
## dbl (9): id, admission_rate, sat_avg, undergrads, tuition, faculty_salary_av...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column summary of the college data (output pasted below).
College %>% summary()
## id name city state
## Min. :100654 Length:1270 Length:1270 Length:1270
## 1st Qu.:153255 Class :character Class :character Class :character
## Median :186327 Mode :character Mode :character Mode :character
## Mean :187222
## 3rd Qu.:215291
## Max. :484905
## region highest_degree control gender
## Length:1270 Length:1270 Length:1270 Length:1270
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## admission_rate sat_avg undergrads tuition
## Min. :0.0509 Min. : 720.0 Min. : 47 Min. : 2732
## 1st Qu.:0.5339 1st Qu.: 973.2 1st Qu.: 1294 1st Qu.: 8966
## Median :0.6685 Median :1040.5 Median : 2554 Median :19995
## Mean :0.6498 Mean :1059.6 Mean : 5625 Mean :21011
## 3rd Qu.:0.7857 3rd Qu.:1120.8 3rd Qu.: 6713 3rd Qu.:30355
## Max. :1.0000 Max. :1545.0 Max. :52280 Max. :51008
## faculty_salary_avg loan_default_rate median_debt lon
## Min. : 1451 Length:1270 Min. : 6056 Min. :-157.92
## 1st Qu.: 6191 Class :character 1st Qu.:21250 1st Qu.: -94.17
## Median : 7268 Mode :character Median :24544 Median : -84.88
## Mean : 7655 Mean :23477 Mean : -88.29
## 3rd Qu.: 8670 3rd Qu.:27000 3rd Qu.: -78.63
## Max. :20650 Max. :41000 Max. : -68.59
## lat
## Min. :19.71
## 1st Qu.:35.20
## Median :39.74
## Mean :38.60
## 3rd Qu.:41.81
## Max. :61.22
# Scatterplot of average SAT score (x) against tuition (y) for each college.
# Title corrected: "Tuiton" -> "Tuition", with the missing space after "vs.".
ggplot(College, aes(x = sat_avg, y = tuition)) +
  geom_point() +
  ggtitle("SAT AVG vs. Tuition")

###