### Jack Gonzalez R Assignment 1

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
### Load the vehicle testing data and, in the same pipeline, convert the
### categorical text columns (class, drive, make, transmissiontype) to factors.
vehicles <- read_csv(
  file = "https://s3.amazonaws.com/itao-30230/vehicles.csv",
  col_types = "inincicccici"
) %>%
  mutate(across(c(class, drive, make, transmissiontype), as.factor))

###Problem 1
###In this part of the assignment, you will generate five graphs from this dataset meeting the 
###provided requirements. You should be able to build these visualizations without using the dplyr library.

###Part A
###Generate a scatterplot showing the miles per gallon that the vehicle experiences
###during city driving on the x-axis and the vehicle’s CO2 emissions on the y-axis.

### Scatterplot with city MPG on the x-axis and CO2 emissions on the y-axis,
### one point per vehicle.
ggplot(vehicles, aes(x = citympg, y = co2emissions)) +
  geom_point() +
  labs(
    title = "City MPG vs. Emissions",
    subtitle = "Emissions decreased with better MPG"
  )

###Part B
###Create a second scatterplot showing the same information as the plot from Part A but use color to distinguish
###vehicles by drive type.

### Same city MPG vs. CO2 emissions plot, drawn with jittered points and
### coloured by drive type.
ggplot(vehicles, aes(x = citympg, y = co2emissions, color = drive)) +
  geom_jitter() +
  labs(
    title = "City MPG vs. Emissions",
    subtitle = "Emissions decreased with better MPG"
  )

###Part C
###Create a stacked bar chart that shows the number of vehicles tested each year broken out by vehicle class.

### Stacked bar chart: number of vehicles tested each year, stacked by class.
### The bar outline is a constant colour, so it is set outside aes(). Inside
### aes() the string is treated as data: ggplot2 adds a legend entry for it and
### draws the outline in a default palette colour instead of green.
ggplot(data = vehicles) +
  geom_bar(mapping = aes(x = year, fill = class), color = "green") +
  ggtitle("Number of Car Classes by Year")

###Part D
###Create a set of histograms that shows the number of vehicles tested by their mileage per gallon during 
###city driving. You should use one call to ggplot that creates separate histograms for each transmission type.


### Histograms of city MPG, one panel (row) per transmission type, produced by
### a single ggplot call.
### citympg holds whole numbers, so binwidth = 1 gives exactly one bar per MPG
### value. The default of 30 bins would give bins of non-integer width, so
### neighbouring bars would cover different numbers of MPG values, and ggplot2
### would print a "pick better value" message.
vehicles %>%
  ggplot(mapping = aes(x = citympg)) +
  geom_histogram(binwidth = 1, fill = "white", color = "black") +
  facet_grid(transmissiontype ~ .)

###Problem 2 - Working with dplyr
###In this part of the assignment, you will expand your work to use the dplyr grammar of data manipulation.
###For each part below, use a single R statement to perform the task. You may do this by joining together 
###dplyr verbs and the ggplot commands with %>% and +.
###Part A
###Print a table showing the minimum, maximum, mean, and median city MPG for vehicles tested, broken out
###by vehicle class. All values should be displayed as integers. Use the as.integer() and round() 
###functions, as necessary.


### Minimum, maximum, mean and median city MPG per vehicle class, as integers.
### as.integer() alone truncates toward zero (e.g. 14.9 becomes 14), so the mean
### and median are passed through round() first, as the task asks.
### NOTE: the table recorded below was printed before this fix (truncated means);
### re-run the script to refresh it.
vehicles %>%
  group_by(class) %>%
  summarise(
    Minimum = as.integer(min(citympg)),
    Maximum = as.integer(max(citympg)),
    Mean = as.integer(round(mean(citympg))),
    Median = as.integer(round(median(citympg)))
  )
## # A tibble: 10 × 5
##    class                   Minimum Maximum  Mean Median
##    <fct>                     <int>   <int> <int>  <int>
##  1 Compact Cars                  7      52    20     20
##  2 Large Cars                    7      57    16     16
##  3 Midsize Cars                  7      51    18     18
##  4 Minivan                      13      22    16     16
##  5 Pickup                        8      30    14     15
##  6 Special Purpose Vehicle       8      31    15     15
##  7 Sport Utility                10      34    16     16
##  8 Subcompact Cars               7      43    19     19
##  9 Two Seaters                   6      49    16     16
## 10 Vans                          8      23    13     13
###Part B
###Display a line graph showing the change in average city vs. highway MPG over time. Do not round the
###data to integers this time.  Show the city MPG as a red line and the highway MPG as a blue line.

### Line graph of the yearly average city and highway MPG (not rounded).
### Strings inside aes() are only legend labels, not colours, so the actual
### colours are assigned in scale_color_manual(): city = red, highway = blue.
vehicles %>%
  group_by(year) %>%
  summarize(
    city = mean(citympg, na.rm = TRUE),
    highway = mean(highwaympg, na.rm = TRUE)
  ) %>%
  ggplot(mapping = aes(x = year)) +
  geom_line(mapping = aes(y = city, color = "City")) +
  geom_line(mapping = aes(y = highway, color = "Highway")) +
  scale_color_manual(
    name = "MPG",
    values = c(City = "red", Highway = "blue")
  ) +
  labs(y = "Average MPG")

### Part C
### Modify the graph above to also show the overall MPG, computed as the average of city and 
###highway MPG.  Plot this as a green line.

### Yearly average city, highway and overall MPG, where overall is the average
### of the city and highway averages.
### Strings inside aes() are only legend labels, not colours, so the actual
### colours are assigned in scale_color_manual(): city = red, highway = blue,
### overall = green.
vehicles %>%
  group_by(year) %>%
  summarize(
    city = mean(citympg, na.rm = TRUE),
    highway = mean(highwaympg, na.rm = TRUE)
  ) %>%
  ggplot(mapping = aes(x = year)) +
  geom_line(mapping = aes(y = city, color = "City")) +
  geom_line(mapping = aes(y = highway, color = "Highway")) +
  geom_line(mapping = aes(y = (city + highway) / 2, color = "Overall")) +
  scale_color_manual(
    name = "MPG",
    values = c(City = "red", Highway = "blue", Overall = "green")
  ) +
  labs(y = "Average MPG")

###Part D
###Modify the graph above to show separate graphs for each drive type.

### Yearly average city, highway and overall MPG, one panel (row) per drive type.
### .groups = "drop" returns an ungrouped summary, so summarise() no longer
### prints its "grouped output" message.
### Strings inside aes() are only legend labels, not colours, so the actual
### colours are assigned in scale_color_manual(): city = red, highway = blue,
### overall = green.
vehicles %>%
  group_by(year, drive) %>%
  summarize(
    city = mean(citympg, na.rm = TRUE),
    highway = mean(highwaympg, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  ggplot(mapping = aes(x = year)) +
  geom_line(mapping = aes(y = city, color = "City")) +
  geom_line(mapping = aes(y = highway, color = "Highway")) +
  geom_line(mapping = aes(y = (city + highway) / 2, color = "Overall")) +
  scale_color_manual(
    name = "MPG",
    values = c(City = "red", Highway = "blue", Overall = "green")
  ) +
  labs(y = "Average MPG") +
  facet_grid(drive ~ .)

###Problem 3 - Data Exploration and Visualization
 
###Use the strategies you learned in class as well as your own independent research to explore the vehicle testing and college datasets in more detail. 
###Create two interesting visualizations based upon your exploration. Take the time to beautify them.
###Your grade for this portion of the assignment will be based upon your technical ability to create
###the visualizations, their analytical uniqueness, and their appearance. Feel free to use the techniques
###that we explored in class and/or to explore other features of ggplot2. For each of your visualizations,
###provide a brief description of the insight that they provide. You should answer the question
###(with just a sentence or two), “What do we learn from this visualization?” You can provide your answer
###as comments in your code or, if you would like to explore a new technology, try using RMarkdown.



### Exploratory look at the vehicles data before building the Problem 3 plots.
### view() displays the table itself (presumably in the IDE's data viewer when
### run interactively - confirm behaviour when the script is run as a batch job).
view(vehicles)
### Hmisc::describe() prints a per-variable summary (n, missing, distinct values,
### and quantiles or frequency tables). It is called with :: so Hmisc must be
### installed but does not need to be attached.
Hmisc::describe(vehicles)
## vehicles 
## 
##  12  Variables      36979  Observations
## --------------------------------------------------------------------------------
## citympg 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    36979        0       46    0.994    17.53    4.822       11       12 
##      .25      .50      .75      .90      .95 
##       15       17       20       23       26 
## 
## lowest :  6  7  8  9 10, highest: 49 51 52 55 57
## --------------------------------------------------------------------------------
## co2emissions 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    36979        0      569    0.997    476.6    129.3    313.0    341.8 
##      .25      .50      .75      .90      .95 
##    400.0    467.7    555.4    634.8    683.6 
## 
## lowest :   29.0000   37.0000   40.0000   97.0000  101.0000
## highest:  847.0000  888.7000  987.4444 1110.8750 1269.5714
## --------------------------------------------------------------------------------
## cylinders 
##        n  missing distinct     Info     Mean      Gmd 
##    36979        0        9    0.892    5.776     1.85 
## 
## lowest :  2  3  4  5  6, highest:  6  8 10 12 16
##                                                                 
## Value          2     3     4     5     6     8    10    12    16
## Frequency     49   232 13719   730 13218  8287   153   582     9
## Proportion 0.001 0.006 0.371 0.020 0.357 0.224 0.004 0.016 0.000
## --------------------------------------------------------------------------------
## displacement 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    36979        0       65    0.998    3.346    1.531      1.6      1.8 
##      .25      .50      .75      .90      .95 
##      2.2      3.0      4.3      5.4      5.9 
## 
## lowest : 0.6 0.9 1.0 1.1 1.2, highest: 7.0 7.4 8.0 8.3 8.4
## --------------------------------------------------------------------------------
## drive 
##        n  missing distinct 
##    36979        0        5 
## 
## lowest : 2-Wheel Drive     4-Wheel Drive     All-Wheel Drive   Front-Wheel Drive Rear-Wheel Drive 
## highest: 2-Wheel Drive     4-Wheel Drive     All-Wheel Drive   Front-Wheel Drive Rear-Wheel Drive 
##                                                                 
## Value          2-Wheel Drive     4-Wheel Drive   All-Wheel Drive
## Frequency                491              1349              8871
## Proportion             0.013             0.036             0.240
##                                               
## Value      Front-Wheel Drive  Rear-Wheel Drive
## Frequency              13074             13194
## Proportion             0.354             0.357
## --------------------------------------------------------------------------------
## highwaympg 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    36979        0       50    0.997    23.77    6.371       15       17 
##      .25      .50      .75      .90      .95 
##       20       24       27       31       34 
## 
## lowest :  9 10 11 12 13, highest: 54 58 59 60 61
## --------------------------------------------------------------------------------
## make 
##        n  missing distinct 
##    36979        0      128 
## 
## lowest : Acura                       Alfa Romeo                  AM General                  American Motors Corporation ASC Incorporated           
## highest: Volkswagen                  Volvo                       VPG                         Wallace Environmental       Yugo                       
## --------------------------------------------------------------------------------
## model 
##        n  missing distinct 
##    36979        0     3650 
## 
## lowest : 1-Ton Truck 2WD   100               100 quattro       100 quattro Wagon 100 Wagon        
## highest: Z4 sDrive35i      Z4 sDrive35is     Z8                ZDX 4WD           Zephyr           
## --------------------------------------------------------------------------------
## class 
##        n  missing distinct 
##    36979        0       10 
## 
## lowest : Compact Cars            Large Cars              Midsize Cars            Minivan                 Pickup                 
## highest: Special Purpose Vehicle Sport Utility           Subcompact Cars         Two Seaters             Vans                   
## --------------------------------------------------------------------------------
## year 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    36979        0       35    0.999     2001    11.99     1985     1987 
##      .25      .50      .75      .90      .95 
##     1991     2001     2010     2015     2017 
## 
## lowest : 1984 1985 1986 1987 1988, highest: 2014 2015 2016 2017 2018
## --------------------------------------------------------------------------------
## transmissiontype 
##        n  missing distinct 
##    36979        0        2 
##                               
## Value      Automatic    Manual
## Frequency      24910     12069
## Proportion     0.674     0.326
## --------------------------------------------------------------------------------
## transmissionspeeds 
##        n  missing distinct     Info     Mean      Gmd 
##    36979        0        9    0.928    4.954    1.315 
## 
## lowest :  1  3  4  5  6, highest:  6  7  8  9 10
##                                                                 
## Value          1     3     4     5     6     7     8     9    10
## Frequency      6  2799 12391 11004  7307  1683  1546   204    39
## Proportion 0.000 0.076 0.335 0.298 0.198 0.046 0.042 0.006 0.001
## --------------------------------------------------------------------------------
### Per-column summary of vehicles: quartiles for numeric columns, level counts
### for factors.
vehicles %>% summary()
##     citympg       co2emissions      cylinders       displacement  
##  Min.   : 6.00   Min.   :  29.0   Min.   : 2.000   Min.   :0.600  
##  1st Qu.:15.00   1st Qu.: 400.0   1st Qu.: 4.000   1st Qu.:2.200  
##  Median :17.00   Median : 467.7   Median : 6.000   Median :3.000  
##  Mean   :17.53   Mean   : 476.6   Mean   : 5.776   Mean   :3.346  
##  3rd Qu.:20.00   3rd Qu.: 555.4   3rd Qu.: 6.000   3rd Qu.:4.300  
##  Max.   :57.00   Max.   :1269.6   Max.   :16.000   Max.   :8.400  
##                                                                   
##                drive         highwaympg           make          model          
##  2-Wheel Drive    :  491   Min.   : 9.00   Chevrolet: 3750   Length:36979      
##  4-Wheel Drive    : 1349   1st Qu.:20.00   Ford     : 3044   Class :character  
##  All-Wheel Drive  : 8871   Median :24.00   Dodge    : 2461   Mode  :character  
##  Front-Wheel Drive:13074   Mean   :23.77   GMC      : 2414                     
##  Rear-Wheel Drive :13194   3rd Qu.:27.00   Toyota   : 1840                     
##                            Max.   :61.00   BMW      : 1774                     
##                                            (Other)  :21696                     
##                      class           year       transmissiontype
##  Compact Cars           :7918   Min.   :1984   Automatic:24910  
##  Pickup                 :5763   1st Qu.:1991   Manual   :12069  
##  Midsize Cars           :5226   Median :2001                    
##  Sport Utility          :5156   Mean   :2001                    
##  Subcompact Cars        :4523   3rd Qu.:2010                    
##  Special Purpose Vehicle:2378   Max.   :2018                    
##  (Other)                :6015                                   
##  transmissionspeeds
##  Min.   : 1.000    
##  1st Qu.: 4.000    
##  Median : 5.000    
##  Mean   : 4.954    
##  3rd Qu.: 6.000    
##  Max.   :10.000    
## 
### Compact structure listing of vehicles: each column's type and first values.
vehicles %>% str()
## tibble [36,979 × 12] (S3: tbl_df/tbl/data.frame)
##  $ citympg           : int [1:36979] 22 21 23 23 15 17 19 19 19 21 ...
##  $ co2emissions      : num [1:36979] 386 404 370 342 523 ...
##  $ cylinders         : int [1:36979] 4 4 4 4 5 4 4 4 4 4 ...
##  $ displacement      : num [1:36979] 1.5 1.5 1.5 1.5 2.2 1.8 2 2 2 2 ...
##  $ drive             : Factor w/ 5 levels "2-Wheel Drive",..: 3 3 3 3 3 4 4 4 4 4 ...
##  $ highwaympg        : int [1:36979] 24 23 27 29 20 23 26 26 26 26 ...
##  $ make              : Factor w/ 128 levels "Acura","Alfa Romeo",..: 119 119 46 119 124 17 17 18 20 82 ...
##  $ model             : chr [1:36979] "Tercel Wagon 4WD" "Tercel Wagon 4WD" "Civic Wagon 4WD" "Tercel Wagon 4WD" ...
##  $ class             : Factor w/ 10 levels "Compact Cars",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year              : int [1:36979] 1985 1985 1985 1985 1985 1985 1985 1985 1985 1985 ...
##  $ transmissiontype  : Factor w/ 2 levels "Automatic","Manual": 1 1 2 2 2 1 1 1 1 1 ...
##  $ transmissionspeeds: int [1:36979] 3 3 5 5 5 3 3 3 3 3 ...
### This graph shows how bad the city MPG is for the majority of vehicles in this data set.
### year is numeric, so it is wrapped in factor() to give each bar one discrete
### fill segment per year. With a continuous fill, ggplot2 drops the aesthetic
### and warns. The outline colour is a constant, so it is set outside aes()
### rather than mapped as data.
ggplot(data = vehicles) +
  geom_bar(mapping = aes(x = citympg, fill = factor(year)), color = "green") +
  labs(title = "City MPG by Year", fill = "year")

### Jittered scatterplot of highway MPG (x) against CO2 emissions (y), with
### points coloured by model year on a continuous scale.
### The title now names the two plotted axes; year appears only as the colour.
ggplot(data = vehicles) +
  geom_jitter(mapping = aes(x = highwaympg, y = co2emissions, color = year)) +
  ggtitle("Highway MPG vs. Emissions", subtitle = "Emissions decreased with Year")

###This graph shows that model year is positively correlated with highway MPG.





### Load the college data set, leaving the column types for read_csv() to guess.
### NOTE(review): the output below shows loan_default_rate read as character;
### convert it before using it numerically.
College <- read_csv(file = "https://s3.amazonaws.com/itao-30230/college.csv")
## Rows: 1270 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): name, city, state, region, highest_degree, control, gender, loan_de...
## dbl (9): id, admission_rate, sat_avg, undergrads, tuition, faculty_salary_av...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
### Per-column summary of the college data.
College %>% summary()
##        id             name               city              state          
##  Min.   :100654   Length:1270        Length:1270        Length:1270       
##  1st Qu.:153255   Class :character   Class :character   Class :character  
##  Median :186327   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :187222                                                           
##  3rd Qu.:215291                                                           
##  Max.   :484905                                                           
##     region          highest_degree       control             gender         
##  Length:1270        Length:1270        Length:1270        Length:1270       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  admission_rate      sat_avg         undergrads       tuition     
##  Min.   :0.0509   Min.   : 720.0   Min.   :   47   Min.   : 2732  
##  1st Qu.:0.5339   1st Qu.: 973.2   1st Qu.: 1294   1st Qu.: 8966  
##  Median :0.6685   Median :1040.5   Median : 2554   Median :19995  
##  Mean   :0.6498   Mean   :1059.6   Mean   : 5625   Mean   :21011  
##  3rd Qu.:0.7857   3rd Qu.:1120.8   3rd Qu.: 6713   3rd Qu.:30355  
##  Max.   :1.0000   Max.   :1545.0   Max.   :52280   Max.   :51008  
##  faculty_salary_avg loan_default_rate   median_debt         lon         
##  Min.   : 1451      Length:1270        Min.   : 6056   Min.   :-157.92  
##  1st Qu.: 6191      Class :character   1st Qu.:21250   1st Qu.: -94.17  
##  Median : 7268      Mode  :character   Median :24544   Median : -84.88  
##  Mean   : 7655                         Mean   :23477   Mean   : -88.29  
##  3rd Qu.: 8670                         3rd Qu.:27000   3rd Qu.: -78.63  
##  Max.   :20650                         Max.   :41000   Max.   : -68.59  
##       lat       
##  Min.   :19.71  
##  1st Qu.:35.20  
##  Median :39.74  
##  Mean   :38.60  
##  3rd Qu.:41.81  
##  Max.   :61.22
### Scatterplot of each college's average SAT score (x) against its tuition (y).
### The title's spacing and the spelling of "Tuition" are corrected.
ggplot(data = College) +
  geom_point(mapping = aes(x = sat_avg, y = tuition)) +
  ggtitle("SAT AVG vs. Tuition")

###