Load required packages

# Set echo = TRUE as a knitr chunk option for this document.
knitr::opts_chunk$set(echo = TRUE)

# References
# ggplot types reference - http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html#Bar%20Chart

# Clear the console: "\014" is the form-feed control character.
# NOTE(review): presumably aimed at an interactive console (e.g. RStudio);
# confirm it is still wanted when the document is knitted.
cat("\014")

# Load the libraries
# Every package the analysis needs, listed in the order they are attached.
# nycflights13 provides the flights data used for this project.
requiredPackages <- c(
  "tidyr", "dplyr", "ggplot2", "nycflights13", "reshape2", "plotrix"
)

# Attach each package. require() returns FALSE when a package is not
# installed; in that case install it and attach it with library(), which
# stops with an error if the installation did not succeed.
for (packageName in requiredPackages) {
  if (!require(packageName, character.only = TRUE)) {
    install.packages(packageName)
    library(packageName, character.only = TRUE)
  }
}
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.3.3
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
## Loading required package: nycflights13
## Warning: package 'nycflights13' was built under R version 3.3.3
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 3.3.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.3.3

Dataset 1 - AMES Data

# DATASET 1 - AMES Dataset
# Read the CSV directly from its GitHub URL, then show the first six rows.
amesData <- read.csv(
  file = "https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/AMES%20RESIDENTIAL%20HOME%20SALES%20(AMES).csv"
)
head(amesData, n = 6L)
##   Order       PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## 1     1 526301100          20        RL          141    31770   Pave  <NA>
## 2     2 526350040          20        RH           80    11622   Pave  <NA>
## 3     3 526351010          20        RL           81    14267   Pave  <NA>
## 4     4 526353030          20        RL           93    11160   Pave  <NA>
## 5     5 527105010          60        RL           74    13830   Pave  <NA>
## 6     6 527105030          60        RL           78     9978   Pave  <NA>
##   Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood
## 1       IR1          Lvl    AllPub     Corner        Gtl        NAmes
## 2       Reg          Lvl    AllPub     Inside        Gtl        NAmes
## 3       IR1          Lvl    AllPub     Corner        Gtl        NAmes
## 4       Reg          Lvl    AllPub     Corner        Gtl        NAmes
## 5       IR1          Lvl    AllPub     Inside        Gtl      Gilbert
## 6       IR1          Lvl    AllPub     Inside        Gtl      Gilbert
##   Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond
## 1        Norm        Norm      1Fam      1Story            6            5
## 2       Feedr        Norm      1Fam      1Story            5            6
## 3        Norm        Norm      1Fam      1Story            6            6
## 4        Norm        Norm      1Fam      1Story            7            5
## 5        Norm        Norm      1Fam      2Story            5            5
## 6        Norm        Norm      1Fam      2Story            6            6
##   Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd
## 1       1960           1960        Hip   CompShg      BrkFace      Plywood
## 2       1961           1961      Gable   CompShg      VinylSd      VinylSd
## 3       1958           1958        Hip   CompShg      Wd Sdng      Wd Sdng
## 4       1968           1968        Hip   CompShg      BrkFace      BrkFace
## 5       1997           1998      Gable   CompShg      VinylSd      VinylSd
## 6       1998           1998      Gable   CompShg      VinylSd      VinylSd
##   Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 1        Stone          112         TA         TA     CBlock        TA
## 2         None            0         TA         TA     CBlock        TA
## 3      BrkFace          108         TA         TA     CBlock        TA
## 4         None            0         Gd         TA     CBlock        TA
## 5         None            0         TA         TA      PConc        Gd
## 6      BrkFace           20         TA         TA      PConc        TA
##   Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## 1        Gd            Gd            BLQ          639            Unf
## 2        TA            No            Rec          468            LwQ
## 3        TA            No            ALQ          923            Unf
## 4        TA            No            ALQ         1065            Unf
## 5        TA            No            GLQ          791            Unf
## 6        TA            No            GLQ          602            Unf
##   BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air
## 1            0         441          1080    GasA         Fa           Y
## 2          144         270           882    GasA         TA           Y
## 3            0         406          1329    GasA         TA           Y
## 4            0        1045          2110    GasA         Ex           Y
## 5            0         137           928    GasA         Gd           Y
## 6            0         324           926    GasA         Ex           Y
##   Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area
## 1      SBrkr        1656           0               0        1656
## 2      SBrkr         896           0               0         896
## 3      SBrkr        1329           0               0        1329
## 4      SBrkr        2110           0               0        2110
## 5      SBrkr         928         701               0        1629
## 6      SBrkr         926         678               0        1604
##   Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr
## 1              1              0         1         0             3
## 2              0              0         1         0             2
## 3              0              0         1         1             3
## 4              1              0         2         1             3
## 5              0              0         2         1             3
## 6              0              0         2         1             3
##   Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd Functional Fireplaces
## 1             1           TA             7        Typ          2
## 2             1           TA             5        Typ          0
## 3             1           Gd             6        Typ          0
## 4             1           Ex             8        Typ          2
## 5             1           TA             6        Typ          1
## 6             1           Gd             7        Typ          1
##   Fireplace.Qu Garage.Type Garage.Yr.Blt Garage.Finish Garage.Cars
## 1           Gd      Attchd          1960           Fin           2
## 2         <NA>      Attchd          1961           Unf           1
## 3         <NA>      Attchd          1958           Unf           1
## 4           TA      Attchd          1968           Fin           2
## 5           TA      Attchd          1997           Fin           2
## 6           Gd      Attchd          1998           Fin           2
##   Garage.Area Garage.Qual Garage.Cond Paved.Drive Wood.Deck.SF
## 1         528          TA          TA           P          210
## 2         730          TA          TA           Y          140
## 3         312          TA          TA           Y          393
## 4         522          TA          TA           Y            0
## 5         482          TA          TA           Y          212
## 6         470          TA          TA           Y          360
##   Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area Pool.QC
## 1            62              0           0            0         0    <NA>
## 2             0              0           0          120         0    <NA>
## 3            36              0           0            0         0    <NA>
## 4             0              0           0            0         0    <NA>
## 5            34              0           0            0         0    <NA>
## 6            36              0           0            0         0    <NA>
##   Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
## 1  <NA>         <NA>        0       5    2010       WD          Normal
## 2 MnPrv         <NA>        0       6    2010       WD          Normal
## 3  <NA>         Gar2    12500       6    2010       WD          Normal
## 4  <NA>         <NA>        0       4    2010       WD          Normal
## 5 MnPrv         <NA>        0       3    2010       WD          Normal
## 6  <NA>         <NA>        0       6    2010       WD          Normal
##   SalePrice
## 1    215000
## 2    105000
## 3    172000
## 4    244000
## 5    189900
## 6    195500
# 1. Find the mean sale price for each year, per home type and overall.
# The analysis columns are selected by name (new name = source column) rather
# than by position, so a change in the CSV's column order cannot silently pick
# the wrong fields.
# NOTE: "House_Style" is filled from the Bldg.Type column (values such as
# 1Fam, 2fmCon, Duplex, Twnhs, TwnhsE), not from the dataset's House.Style
# column. The name is kept because the rest of the script refers to it.
analysisColumns <- c(
  S.No = "Order",
  PID = "PID",
  House_Style = "Bldg.Type",
  Living_Area_Size = "Gr.Liv.Area",
  Year_Built = "Year.Built",
  Year_Sold = "Yr.Sold",
  Sale_Price = "SalePrice"
)
amesDataForAnalysis <- amesData[, unname(analysisColumns)]
names(amesDataForAnalysis) <- names(analysisColumns)

# Mean price and number of sales per year and home type
amesDataMean <- group_by(amesDataForAnalysis, Year_Sold, House_Style)
amesDataMeanSummary <- summarise(amesDataMean, meanValue = mean(Sale_Price), NumberOfHomes = n())

# Mean price for all house types
amesDataMeanAll <- group_by(amesDataForAnalysis, Year_Sold)
amesDataMeanSummaryAll <- summarise(amesDataMeanAll, meanValue = mean(Sale_Price), NumberOfHomes = n())
amesDataMeanSummaryAll
## # A tibble: 5 x 3
##   Year_Sold meanValue NumberOfHomes
##       <int>     <dbl>         <int>
## 1      2006  181761.6           625
## 2      2007  185138.2           694
## 3      2008  178841.8           622
## 4      2009  181404.6           648
## 5      2010  172597.6           341
# 2. Draw a comparison graph of the mean price by year
theme_set(theme_classic())

# Bar chart of the yearly mean sale price over all home types
amesDataMeanSummaryAllPlot <- ggplot(
  data = amesDataMeanSummaryAll,
  mapping = aes(x = Year_Sold, y = meanValue)
)
amesDataMeanSummaryAllPlot +
  geom_bar(stat = "identity", fill = "tomato2", width = 0.2) +
  labs(
    caption = "Source: AMES Dataset from OpenIntro Statistics website",
    subtitle = "All Home sold between 2006 - 2010",
    title = "Mean Price Bar Chart"
  ) +
  theme(axis.text.x = element_text(vjust = 0.6, angle = 65))

# Mean summary for each home type found in the House_Style column

# Layers shared by every per-type chart; only the subtitle depends on the
# home type. Returned as a list so it can be added to a ggplot object with `+`.
meanPriceBarLayers <- function(houseType) {
  list(
    geom_bar(stat = "identity", width = 0.2, fill = "tomato2"),
    labs(title = "Mean Price Bar Chart",
         subtitle = paste0("All ", houseType, " Home sold between 2006 - 2010"),
         caption = "Source: AMES Dataset from OpenIntro Statistics website"),
    theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
  )
}

# Draw one chart per home type. print() is needed because a ggplot object is
# only drawn automatically when it is the value of a top-level expression,
# which is not the case inside a loop.
for (houseType in c("1Fam", "2fmCon", "Duplex", "Twnhs", "TwnhsE")) {
  amesDataMeanSummaryPlot <- ggplot(
    filter(amesDataMeanSummary, House_Style == houseType),
    aes(Year_Sold, meanValue)
  )
  print(amesDataMeanSummaryPlot + meanPriceBarLayers(houseType))
}
## Warning: package 'bindrcpp' was built under R version 3.3.3

# 3. Sales record count and mean price for each year and home type
# (House_Style holds values such as 1Fam, 2fmCon, Duplex, Twnhs and TwnhsE)
amesDataMean <- amesDataForAnalysis %>%
  group_by(Year_Sold, House_Style)
amesDataMeanSummary <- amesDataMean %>%
  summarise(meanValue = mean(Sale_Price), NumberOfHomes = n())
print(amesDataMeanSummary)
## # A tibble: 25 x 4
## # Groups:   Year_Sold [?]
##    Year_Sold House_Style meanValue NumberOfHomes
##        <int>      <fctr>     <dbl>         <int>
##  1      2006        1Fam  187539.7           505
##  2      2006      2fmCon  138752.9            17
##  3      2006      Duplex  137450.4            25
##  4      2006       Twnhs  126638.9            27
##  5      2006      TwnhsE  189787.8            51
##  6      2007        1Fam  188498.5           593
##  7      2007      2fmCon  132200.0            12
##  8      2007      Duplex  130990.2            20
##  9      2007       Twnhs  142266.1            18
## 10      2007      TwnhsE  194887.9            51
## # ... with 15 more rows

Dataset 2 - nycflights13 package Flights Data

# Preview the airlines table (carrier code and airline name)
head(airlines)
## # A tibble: 6 x 2
##   carrier                     name
##     <chr>                    <chr>
## 1      9E        Endeavor Air Inc.
## 2      AA   American Airlines Inc.
## 3      AS     Alaska Airlines Inc.
## 4      B6          JetBlue Airways
## 5      DL     Delta Air Lines Inc.
## 6      EV ExpressJet Airlines Inc.
# Preview the planes table (tail number, year, type, manufacturer, model, ...)
head(planes)
## # A tibble: 6 x 9
##   tailnum  year                    type     manufacturer     model engines
##     <chr> <int>                   <chr>            <chr>     <chr>   <int>
## 1  N10156  2004 Fixed wing multi engine          EMBRAER EMB-145XR       2
## 2  N102UW  1998 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## 3  N103US  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## 4  N104UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## 5  N10575  2002 Fixed wing multi engine          EMBRAER EMB-145LR       2
## 6  N105UW  1999 Fixed wing multi engine AIRBUS INDUSTRIE  A320-214       2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
# Preview the flights table, which the analysis below is based on
head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515         2      830
## 2  2013     1     1      533            529         4      850
## 3  2013     1     1      542            540         2      923
## 4  2013     1     1      544            545        -1     1004
## 5  2013     1     1      554            600        -6      812
## 6  2013     1     1      554            558        -4      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>
# Take working copies of the package tables and report the flight row count
airlinesDF <- airlines
flightsDF <- flights
dim(flightsDF)[1L]
## [1] 336776
# Histogram of the arrival delays over all flights
hist(
  x = flightsDF$arr_delay,
  xlab = "Delay time in Mins",
  ylab = "Number of Occurences",
  main = "Arrival Delays"
)

# Flights that depart from JFK
flightsDFJFK <- filter(flightsDF, origin == "JFK")

# The ten destinations with the most flights out of JFK
flightsDest <- group_by(flightsDFJFK, dest)
flightsDest <- summarize(flightsDest, number_flights = n())
flightsDest <- head(arrange(flightsDest, desc(number_flights)), n = 10)

# Plot
# The axis titles are set through labs(x =, y =). They used to be passed to
# ggplot() as xlab/ylab, which are not arguments of ggplot() and therefore
# never reached the chart.
theme_set(theme_classic())
FlightsDestBar <- ggplot(flightsDest, aes(dest, number_flights))
FlightsDestBar + geom_bar(stat = "identity", width = 0.5, fill = "red") +
  labs(title = "JFK Traffic Chart",
       subtitle = "# of Flights to different Destinations",
       x = "Destinations",
       y = "Number of Flights") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

Dataset 3 - CDC Data

# Read the CDC data set (Dataset 3) from the OpenIntro site and preview it.
# Assigned with `<-` to match every other assignment in this script.
CDCData <- read.csv("http://www.openintro.org/stat/data/cdc.csv")
head(CDCData)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
# Pie charts of the general health condition (genhlth) for three age bands.
# NOTE(review): the earlier comments described these as charts of smokers, but
# no smoke100 filter was applied; each chart covers every respondent in the
# age band. Confirm which was intended.

# Display label for each genhlth value. Labels are looked up by value so every
# slice stays correctly labelled even if an age band lacks one of the
# categories.
healthLabels <- c(
  "excellent" = "Excellent",
  "fair" = "Fair",
  "good" = "Good",
  "poor" = "Poor",
  "very good" = "Very Good"
)

# Count the rows of `people` per genhlth value, draw the counts as an exploded
# 3D pie chart titled `chartTitle`, and invisibly return the counts.
plotHealthConditionPie <- function(people, chartTitle) {
  healthCounts <- summarize(group_by(people, genhlth), count = n())
  rawValues <- as.character(healthCounts$genhlth)
  lbls <- unname(healthLabels[rawValues])
  # Fall back to the raw value for any category without a display label
  lbls[is.na(lbls)] <- rawValues[is.na(lbls)]
  pie3D(healthCounts$count, labels = lbls, explode = 0.1, main = chartTitle)
  invisible(healthCounts)
}

# Age 30 and below. The chart is built from the filtered rows; previously the
# filter result was left unused and the whole data set was charted.
age30AndLess <- filter(CDCData, age <= 30)
dataByHealthCondition <- plotHealthConditionPie(
  age30AndLess,
  "Pie Chart of Health Condition of People 30 years old or younger"
)

# Age 31 - 45 (the title now states the inclusive upper bound of the filter)
dataByHealthCondition <- plotHealthConditionPie(
  filter(CDCData, age > 30, age <= 45),
  "Pie Chart of Health Condition of People > 30 and <= 45 years old"
)

# Age above 45
dataByHealthCondition <- plotHealthConditionPie(
  filter(CDCData, age > 45),
  "Pie Chart of Health Condition of People > 45 years old"
)

# What proportion of the sample reports being in excellent health?
# Count the rows whose genhlth is "excellent" and divide by the total row
# count. The earlier nrow(count(...)) form always produced 1, because count()
# collapses its input to a single summary row, so the result rounded to 0 %.
print(paste("Percent of people being in excellent health is ", round(sum(CDCData$genhlth == "excellent", na.rm = TRUE) / nrow(CDCData) * 100, 2), "%"))
## (the recorded output "... is  0 %" came from the faulty expression;
##  re-knit the document to refresh it)
# What is the percentage of Male vs Female smokers
# Both filters now also require smoke100 == 1, the flag this script uses to
# identify smokers. Without it the two figures were simply the share of men
# and women among all respondents.
# NOTE(review): assumes smoke100 == 1 means "has smoked at least 100
# cigarettes" - confirm against the data set's codebook.
maleSmokers <- filter(CDCData, gender == "m", smoke100 == 1)
femaleSmokers <- filter(CDCData, gender == "f", smoke100 == 1)
print(paste("Percentage of Male smokers compare to entire population is:", round(nrow(maleSmokers)/nrow(CDCData)*100, 2), "%"))
print(paste("Percentage of Female smokers compare to entire population is:", round(nrow(femaleSmokers)/nrow(CDCData)*100, 2), "%"))
## (the recorded outputs, 47.84 % and 52.15 %, predate the smoke100 filter and
##  were the gender split of all respondents; re-knit to refresh them)
# Respondents with smoke100 == 1 who also report excellent health:
# first the head count, then that count as a share of all respondents.
invisible(local({
  healthySmokerCount <- nrow(filter(CDCData, smoke100 == 1, genhlth == "excellent"))
  print(paste(
    "Number of people smoking 100 cigarette and still staying healthy is",
    healthySmokerCount
  ))
  print(paste(
    "Percentage to general population is: ",
    round(healthySmokerCount / nrow(CDCData) * 100, 2),
    "%"
  ))
}))
## [1] "Number of people smoking 100 cigarette and still staying healthy is 1778"
## [1] "Percentage to general population is:  8.89 %"