Directions

The objective of this assignment is to complete and explain basic plots before moving on to more complicated ways to graph data.

Each question is worth 5 points.

To submit this homework you will create the document in RStudio, using the knitr package (button included in RStudio), and then publish the document to your RPubs account. Once it is uploaded, you will submit the link to that document on Canvas. Please make sure that this link is hyperlinked and that I can see the visualization and the code required to create it (echo=TRUE).

Questions

  1. For the following questions use the Marriage data set from the mosaicData package.
library(ggplot2)

library(reshape)
## 
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
## 
##     rename
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
# Shared minimal theme reused by the Marriage plots: blank panel background
# and gridlines, black axis lines, white legend-key borders, 11-pt text.
cleanup <- local({
  blank <- element_blank()
  theme(
    panel.background = blank,
    panel.grid.major = blank,
    panel.grid.minor = blank,
    axis.line.x = element_line(colour = "black"),
    axis.line.y = element_line(colour = "black"),
    legend.key = element_rect(colour = "white"),
    text = element_text(size = 11)
  )
})



## Load the Marriage data set into a working copy and inspect its structure.
Data_Marriage <- Marriage
str(Data_Marriage)
## 'data.frame':    98 obs. of  15 variables:
##  $ bookpageID   : Factor w/ 49 levels "B230p1209","B230p1354",..: 5 6 7 8 9 1 2 3 4 16 ...
##  $ appdate      : Date, format: "1996-10-29" "1996-11-12" ...
##  $ ceremonydate : Date, format: "1996-11-09" "1996-11-12" ...
##  $ delay        : int  11 0 8 5 5 0 16 0 28 10 ...
##  $ officialTitle: Factor w/ 9 levels "BISHOP","CATHOLIC PRIEST",..: 4 6 6 7 7 6 6 6 6 7 ...
##  $ person       : Factor w/ 2 levels "Bride","Groom": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dob          : Date, format: "1964-04-11" "1964-08-06" ...
##  $ age          : num  32.6 32.3 34.8 40.6 30 ...
##  $ race         : Factor w/ 4 levels "American Indian",..: 4 4 3 2 4 4 4 4 2 4 ...
##  $ prevcount    : int  0 1 1 1 0 1 1 1 0 3 ...
##  $ prevconc     : Factor w/ 2 levels "Death","Divorce": NA 2 2 2 NA NA 2 2 NA 2 ...
##  $ hs           : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ college      : int  7 0 3 4 0 0 0 0 0 6 ...
##  $ dayOfBirth   : num  102 219 51 141 348 52 284 31 338 183 ...
##  $ sign         : Factor w/ 12 levels "Aquarius","Aries",..: 2 6 8 5 9 8 7 1 9 3 ...
## Missing-data evaluation for the Marriage data set:
# Does the data set contain any missing values at all?

any(is.na(Data_Marriage))
## [1] TRUE
# Total number of missing cells across all columns.

sum(is.na(Data_Marriage)) 
## [1] 58
## Remove every row that contains at least one of the 58 missing values, so
# that all later computations run on complete rows only.
# NOTE(review): the str() output above shows `prevconc` is NA in several rows;
# those rows are dropped here too -- confirm that excluding them is intended.


Clean_Marriage_Data = drop_na(Data_Marriage)

# Per-column NA counts after cleaning; every count is expected to be 0.
as.data.frame(colSums(is.na(Clean_Marriage_Data)))
##               colSums(is.na(Clean_Marriage_Data))
## bookpageID                                      0
## appdate                                         0
## ceremonydate                                    0
## delay                                           0
## officialTitle                                   0
## person                                          0
## dob                                             0
## age                                             0
## race                                            0
## prevcount                                       0
## prevconc                                        0
## hs                                              0
## college                                         0
## dayOfBirth                                      0
## sign                                            0
## Re-check for missing data after cleaning:
# No NA values should remain in the cleaned Marriage data set.
anyNA(Clean_Marriage_Data)
## [1] FALSE
# a1) Create an informative and meaningful data graphic.


# Ans (a): 

## Number of observations per race in the cleaned data:
summary(Clean_Marriage_Data[["race"]])
## American Indian           Black        Hispanic           White 
##               0               9               1              34
## Histogram of the number of college years, with bars dodged and filled by race.

## Base plot object: college years on x, fill colour mapped to race.
Marriage_Data_Plot_a <- ggplot(
  Clean_Marriage_Data,
  aes(x = college, fill = race)
)

# Add the histogram layer (one bin per year of college, races side by side),
# the labels, the legend title, and the shared `cleanup` theme.
Marriage_Data_Plot_a <- Marriage_Data_Plot_a +
  geom_histogram(binwidth = 1, position = "dodge") +
  labs(
    title = "Histogram Plot for Number of College Years Count for different Races",
    x = "Number of College Years Scale ",
    y = "Count  Scale"
  ) +
  scale_fill_discrete("Different Races") +
  cleanup

# Pass the plot explicitly: ggsave() otherwise saves last_plot(), which
# silently becomes a different plot if another one is created in between.
ggsave("Marriage_Data_Plot_a.png", plot = Marriage_Data_Plot_a)
## Saving 7 x 5 in image
knitr::include_graphics("Marriage_Data_Plot_a.png")

# a2) Identify each of the visual cues that you are using, and describe how they are related to each variable.

## The main visual cues in the histogram are "color", which distinguishes the different races, and "length", which encodes the count for each race through the height of its bars.



# b) Create a data graphic with at least five variables (either quantitative or categorical). For the purposes of this exercise, do not worry about making your visualization meaningful—just try to encode five variables into one plot.

## Ans:

## Plot encoding five variables from the Marriage data set:
## age (x), college (y), race (colour), officialTitle (shape), person (facet).

# NOTE(review): ggplot2's default shape palette covers at most 6 levels and
# officialTitle is a 9-level factor -- confirm that no points are dropped.
Marriage_Data_Plot_b <- ggplot(Clean_Marriage_Data, aes(x = age, y = college)) +
  geom_point(aes(color = race, shape = officialTitle), size = 3) +
  labs(
    title = "Different Races, and Person's Age groups, Official Title & College Years Count Plot",
    x = "Age Scale ",
    y = "College Years Scale"
  ) +
  facet_wrap(~person, nrow = 2) +
  scale_color_discrete("Different Races") +
  scale_shape_discrete("Official Titles") +
  cleanup

# Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
ggsave("Marriage_Data_Plot_b.png", plot = Marriage_Data_Plot_b)
## Saving 7 x 5 in image
knitr::include_graphics("Marriage_Data_Plot_b.png")

Your objective for the next four questions will be to write the code necessary to exactly recreate the provided graphics.

  1. Boxplot Visualization

This boxplot was built using the mpg dataset. Notice the changes in axis labels.

# Working copy of the mpg data set, followed by a look at its structure.
Data_mpg <- mpg
str(Data_mpg)
## tibble [234 × 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...
## Horizontal boxplot of highway mileage by manufacturer (Problem 2 of HW-4).

# Uses the Data_mpg working copy created above, like the other sections do
# with their own copies, instead of reaching past it to `mpg`.
Mpg_box_plot <- ggplot(Data_mpg, aes(x = manufacturer, y = hwy)) +
  geom_boxplot() +
  coord_flip() +  # draw manufacturers along the vertical axis
  theme_classic() +
  labs(
    x = "Vehicle Manufacturer",
    y = "Highway Fuel Efficiency (mile/gallon)"
  )

# Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
ggsave("Mpg_box_plot.png", plot = Mpg_box_plot)
## Saving 7 x 5 in image
knitr::include_graphics("Mpg_box_plot.png")

  1. Stacked Density Plot

This graphic is built with the diamonds dataset in the ggplot2 package.

# Working copy of the diamonds data set, followed by a look at its structure.
Data_diamonds <- diamonds
str(Data_diamonds)
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
## Density plot of diamond price, filled by cut (Problem 3 of HW-4).
# NOTE(review): the section heading says "stacked", but geom_density() is left
# at its default position here -- confirm against the target graphic.

Data_diamonds_Plot <- ggplot(Data_diamonds, aes(x = price, fill = cut)) +
  geom_density(alpha = 0.1, size = 0.1) +
  labs(
    title = "Diamond Price Density",
    x = "Diamond Price (USD)",
    y = "Density"
  ) +
  # A plot keeps a single fill scale. An earlier
  # scale_fill_brewer(palette = 'Dark2') call was always replaced by this one
  # (ggplot2 printed a message saying so), so it has been removed; the
  # rendered colours are unchanged. If Dark2 was the intent, use
  # scale_fill_brewer("Cut", palette = "Dark2") here instead.
  scale_fill_discrete("Cut")

# Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
ggsave("Data_diamonds_Plot.png", plot = Data_diamonds_Plot)
## Saving 7 x 5 in image
knitr::include_graphics("Data_diamonds_Plot.png")

  1. Sideways bar plot

This graphic uses the penguins dataset and shows the counts between males and females by species.

# Working copy of the penguins data set, followed by a look at its structure.
Data_penguins <- penguins
str(Data_penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
## Sideways bar chart of penguin counts by sex, one facet per species
## (Problem 4 of HW-4).

# count() tabulates each sex/species combination into column `n` (rows with a
# missing sex form their own NA group); geom_col() then draws those counts.
Data_penguins_Plot <- Data_penguins %>%
  count(sex, species) %>%
  ggplot(aes(x = sex, y = n, fill = species)) +
  geom_col() +
  scale_fill_manual(values = c("darkorange", "purple", "cyan4")) +
  facet_wrap(~species, ncol = 1) +
  theme_minimal() +
  labs(x = "Sex", y = "Count") +
  theme(legend.position = "none") +  # facets already label the species
  coord_flip()

# Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
ggsave("Data_penguins_Plot.png", plot = Data_penguins_Plot)
## Saving 7 x 5 in image
knitr::include_graphics("Data_penguins_Plot.png")

  1. Scatterplot

This figure examines the relationship between bill length and depth in the penguins dataset.

# Working copy of the penguins data set for the scatterplot section,
# followed by a look at its structure.
Data_penguins <- penguins
str(Data_penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
## Scatterplot of bill depth against bill length, with points coloured and
## shaped by species and one linear regression line per species.

# Base plot object: both colour and shape are mapped to species.
Penguins_Scatter_Plot_a <- ggplot(
  Data_penguins,
  aes(x = bill_length_mm, y = bill_depth_mm, shape = species, color = species)
)

# Add the points, labels, manual species colours, and the per-species
# linear fits (no confidence band).
Penguin_a <- Penguins_Scatter_Plot_a +
  geom_point() +
  labs(
    title = "Scatter Plot of Bill Length (mm) and Bill Depth (mm) for different Species ",
    x = "Bill Length (mm) ",
    y = "Bill Depth (mm)"
  ) +
  scale_color_manual(values = c("darkorange", "darkorchid", "cyan4")) +
  geom_smooth(method = "lm", se = FALSE)

# Pass the plot explicitly instead of relying on ggsave()'s last_plot() default.
ggsave("penguins_a.png", plot = Penguin_a)

knitr::include_graphics("penguins_a.png")