Load required packages
# Echo the source of every chunk in the knitted document.
knitr::opts_chunk$set(echo = TRUE)
# References
# ggplot types reference - http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html#Bar%20Chart
# Clear the console. "\014" is the form-feed character, used here as a
# "clear screen" request; it is only sent in an interactive session so that it
# does not end up in the captured output when the document is knitted.
if (interactive()) {
  cat("\014")
}
# Load the library
# Make sure tidyr is installed, then attach it.
# requireNamespace() only tests availability (it returns FALSE when the
# package is missing), so library() runs exactly once on every path and stops
# with an error if the package still cannot be attached.
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
library(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.3.3
# Make sure dplyr is installed, then attach it.
# requireNamespace() only tests availability (it returns FALSE when the
# package is missing), so library() runs exactly once on every path and stops
# with an error if the package still cannot be attached.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Make sure ggplot2 is installed, then attach it.
# requireNamespace() only tests availability (it returns FALSE when the
# package is missing), so library() runs exactly once on every path and stops
# with an error if the package still cannot be attached.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
# Flights data used for this Project
# Make sure nycflights13 is installed, then attach it.
# requireNamespace() only tests availability (it returns FALSE when the
# package is missing), so library() runs exactly once on every path and stops
# with an error if the package still cannot be attached.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)
## Loading required package: nycflights13
## Warning: package 'nycflights13' was built under R version 3.3.3
# Make sure reshape2 is installed, then attach it.
# requireNamespace() only tests availability (it returns FALSE when the
# package is missing), so library() runs exactly once on every path and stops
# with an error if the package still cannot be attached.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}
library(reshape2)
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 3.3.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Make sure plotrix is installed, then attach it.
# requireNamespace() only tests availability (it returns FALSE when the
# package is missing), so library() runs exactly once on every path and stops
# with an error if the package still cannot be attached.
if (!requireNamespace("plotrix", quietly = TRUE)) {
  install.packages("plotrix")
}
library(plotrix)
## Loading required package: plotrix
## Warning: package 'plotrix' was built under R version 3.3.3
Dataset 1 - AMES Data
# DATASET 1 - AMES Dataset
# Read the CSV straight from GitHub, then preview the first six rows.
amesData <- read.csv(
  file = "https://raw.githubusercontent.com/kalyanparthasarathy/DATA607/master/AMES%20RESIDENTIAL%20HOME%20SALES%20(AMES).csv"
)
head(amesData, n = 6L)
## Order PID MS.SubClass MS.Zoning Lot.Frontage Lot.Area Street Alley
## 1 1 526301100 20 RL 141 31770 Pave <NA>
## 2 2 526350040 20 RH 80 11622 Pave <NA>
## 3 3 526351010 20 RL 81 14267 Pave <NA>
## 4 4 526353030 20 RL 93 11160 Pave <NA>
## 5 5 527105010 60 RL 74 13830 Pave <NA>
## 6 6 527105030 60 RL 78 9978 Pave <NA>
## Lot.Shape Land.Contour Utilities Lot.Config Land.Slope Neighborhood
## 1 IR1 Lvl AllPub Corner Gtl NAmes
## 2 Reg Lvl AllPub Inside Gtl NAmes
## 3 IR1 Lvl AllPub Corner Gtl NAmes
## 4 Reg Lvl AllPub Corner Gtl NAmes
## 5 IR1 Lvl AllPub Inside Gtl Gilbert
## 6 IR1 Lvl AllPub Inside Gtl Gilbert
## Condition.1 Condition.2 Bldg.Type House.Style Overall.Qual Overall.Cond
## 1 Norm Norm 1Fam 1Story 6 5
## 2 Feedr Norm 1Fam 1Story 5 6
## 3 Norm Norm 1Fam 1Story 6 6
## 4 Norm Norm 1Fam 1Story 7 5
## 5 Norm Norm 1Fam 2Story 5 5
## 6 Norm Norm 1Fam 2Story 6 6
## Year.Built Year.Remod.Add Roof.Style Roof.Matl Exterior.1st Exterior.2nd
## 1 1960 1960 Hip CompShg BrkFace Plywood
## 2 1961 1961 Gable CompShg VinylSd VinylSd
## 3 1958 1958 Hip CompShg Wd Sdng Wd Sdng
## 4 1968 1968 Hip CompShg BrkFace BrkFace
## 5 1997 1998 Gable CompShg VinylSd VinylSd
## 6 1998 1998 Gable CompShg VinylSd VinylSd
## Mas.Vnr.Type Mas.Vnr.Area Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 1 Stone 112 TA TA CBlock TA
## 2 None 0 TA TA CBlock TA
## 3 BrkFace 108 TA TA CBlock TA
## 4 None 0 Gd TA CBlock TA
## 5 None 0 TA TA PConc Gd
## 6 BrkFace 20 TA TA PConc TA
## Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1 BsmtFin.Type.2
## 1 Gd Gd BLQ 639 Unf
## 2 TA No Rec 468 LwQ
## 3 TA No ALQ 923 Unf
## 4 TA No ALQ 1065 Unf
## 5 TA No GLQ 791 Unf
## 6 TA No GLQ 602 Unf
## BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF Heating Heating.QC Central.Air
## 1 0 441 1080 GasA Fa Y
## 2 144 270 882 GasA TA Y
## 3 0 406 1329 GasA TA Y
## 4 0 1045 2110 GasA Ex Y
## 5 0 137 928 GasA Gd Y
## 6 0 324 926 GasA Ex Y
## Electrical X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area
## 1 SBrkr 1656 0 0 1656
## 2 SBrkr 896 0 0 896
## 3 SBrkr 1329 0 0 1329
## 4 SBrkr 2110 0 0 2110
## 5 SBrkr 928 701 0 1629
## 6 SBrkr 926 678 0 1604
## Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath Bedroom.AbvGr
## 1 1 0 1 0 3
## 2 0 0 1 0 2
## 3 0 0 1 1 3
## 4 1 0 2 1 3
## 5 0 0 2 1 3
## 6 0 0 2 1 3
## Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd Functional Fireplaces
## 1 1 TA 7 Typ 2
## 2 1 TA 5 Typ 0
## 3 1 Gd 6 Typ 0
## 4 1 Ex 8 Typ 2
## 5 1 TA 6 Typ 1
## 6 1 Gd 7 Typ 1
## Fireplace.Qu Garage.Type Garage.Yr.Blt Garage.Finish Garage.Cars
## 1 Gd Attchd 1960 Fin 2
## 2 <NA> Attchd 1961 Unf 1
## 3 <NA> Attchd 1958 Unf 1
## 4 TA Attchd 1968 Fin 2
## 5 TA Attchd 1997 Fin 2
## 6 Gd Attchd 1998 Fin 2
## Garage.Area Garage.Qual Garage.Cond Paved.Drive Wood.Deck.SF
## 1 528 TA TA P 210
## 2 730 TA TA Y 140
## 3 312 TA TA Y 393
## 4 522 TA TA Y 0
## 5 482 TA TA Y 212
## 6 470 TA TA Y 360
## Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch Pool.Area Pool.QC
## 1 62 0 0 0 0 <NA>
## 2 0 0 0 120 0 <NA>
## 3 36 0 0 0 0 <NA>
## 4 0 0 0 0 0 <NA>
## 5 34 0 0 0 0 <NA>
## 6 36 0 0 0 0 <NA>
## Fence Misc.Feature Misc.Val Mo.Sold Yr.Sold Sale.Type Sale.Condition
## 1 <NA> <NA> 0 5 2010 WD Normal
## 2 MnPrv <NA> 0 6 2010 WD Normal
## 3 <NA> Gar2 12500 6 2010 WD Normal
## 4 <NA> <NA> 0 4 2010 WD Normal
## 5 MnPrv <NA> 0 3 2010 WD Normal
## 6 <NA> <NA> 0 6 2010 WD Normal
## SalePrice
## 1 215000
## 2 105000
## 3 172000
## 4 244000
## 5 189900
## 6 195500
# 1. Find the mean sale price for each year, per home type and overall.
# Columns are picked by name rather than by position so the selection keeps
# working if the column order of the CSV ever changes. Each entry maps the
# name used in this analysis (left) to the column name in amesData (right).
# NOTE: "House_Style" is filled from Bldg.Type (values such as 1Fam, 2fmCon,
# Duplex, Twnhs, TwnhsE), not from the dataset's own House.Style column. The
# name is kept because the rest of the script filters on it.
amesColumns <- c(
  S.No = "Order",
  PID = "PID",
  House_Style = "Bldg.Type",
  Living_Area_Size = "Gr.Liv.Area",
  Year_Built = "Year.Built",
  Year_Sold = "Yr.Sold",
  Sale_Price = "SalePrice"
)
amesDataForAnalysis <- amesData[, unname(amesColumns)]
names(amesDataForAnalysis) <- names(amesColumns)
# Mean price and number of sales for every year / home type pair
amesDataMean <- group_by(amesDataForAnalysis, Year_Sold, House_Style)
amesDataMeanSummary <- summarise(amesDataMean, meanValue = mean(Sale_Price), NumberOfHomes = n())
# Mean price and number of sales per year across all home types
amesDataMeanAll <- group_by(amesDataForAnalysis, Year_Sold)
amesDataMeanSummaryAll <- summarise(amesDataMeanAll, meanValue = mean(Sale_Price), NumberOfHomes = n())
amesDataMeanSummaryAll
## # A tibble: 5 x 3
## Year_Sold meanValue NumberOfHomes
## <int> <dbl> <int>
## 1 2006 181761.6 625
## 2 2007 185138.2 694
## 3 2008 178841.8 622
## 4 2009 181404.6 648
## 5 2010 172597.6 341
# 2. Draw a comparison graph of the mean price by year
theme_set(theme_classic())
# Yearly mean sale price across all home types, one bar per year
amesDataMeanSummaryAllPlot <- ggplot(
  data = amesDataMeanSummaryAll,
  mapping = aes(x = Year_Sold, y = meanValue)
)
amesDataMeanSummaryAllPlot +
  geom_bar(stat = "identity", width = 0.2, fill = "tomato2") +
  labs(
    title = "Mean Price Bar Chart",
    subtitle = "All Home sold between 2006 - 2010",
    caption = "Source: AMES Dataset from OpenIntro Statistics website"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# Mean Summary for each house type
# The five per-type charts differ only in the home type they filter on and
# name in the subtitle, so one helper adds the shared layers and a loop draws
# the chart for every type.

# Add the shared bar, label, and axis-text layers to a per-type base plot.
# basePlot: ggplot object already bound to one home type's yearly means.
# homeType: home type name; used only to build the subtitle.
# Returns the finished ggplot object (it is drawn when printed).
addMeanPriceLayers <- function(basePlot, homeType) {
  basePlot +
    geom_bar(stat = "identity", width = 0.2, fill = "tomato2") +
    labs(title = "Mean Price Bar Chart",
         subtitle = paste("All", homeType, "Home sold between 2006 - 2010"),
         caption = "Source: AMES Dataset from OpenIntro Statistics website") +
    theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
}

# (The original render reported at the first filter() call:
#  Warning: package 'bindrcpp' was built under R version 3.3.3)
for (homeType in c("1Fam", "2fmCon", "Duplex", "Twnhs", "TwnhsE")) {
  # Base plot for this home type; as before, the variable holds the most
  # recently built base plot once the loop finishes.
  amesDataMeanSummaryPlot <- ggplot(filter(amesDataMeanSummary, House_Style == homeType), aes(Year_Sold, meanValue))
  # Plots are not auto-printed inside a loop, so print explicitly.
  print(addMeanPriceLayers(amesDataMeanSummaryPlot, homeType))
}

# 3. Sales record count and mean price for every year / home type pair
amesDataMean <- amesDataForAnalysis %>%
  group_by(Year_Sold, House_Style)
amesDataMeanSummary <- amesDataMean %>%
  summarise(meanValue = mean(Sale_Price), NumberOfHomes = n())
# Show the summary table
amesDataMeanSummary
## # A tibble: 25 x 4
## # Groups: Year_Sold [?]
## Year_Sold House_Style meanValue NumberOfHomes
## <int> <fctr> <dbl> <int>
## 1 2006 1Fam 187539.7 505
## 2 2006 2fmCon 138752.9 17
## 3 2006 Duplex 137450.4 25
## 4 2006 Twnhs 126638.9 27
## 5 2006 TwnhsE 189787.8 51
## 6 2007 1Fam 188498.5 593
## 7 2007 2fmCon 132200.0 12
## 8 2007 Duplex 130990.2 20
## 9 2007 Twnhs 142266.1 18
## 10 2007 TwnhsE 194887.9 51
## # ... with 15 more rows
Dataset 2 - nycflights13 package Flights Data
# Preview the first six rows of the airlines table
head(airlines, n = 6L)
## # A tibble: 6 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
# Preview the first six rows of the planes table
head(planes, n = 6L)
## # A tibble: 6 x 9
## tailnum year type manufacturer model engines
## <chr> <int> <chr> <chr> <chr> <int>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2
## # ... with 3 more variables: seats <int>, speed <int>, engine <chr>
# Preview the first six rows of the flights table
head(flights, n = 6L)
## # A tibble: 6 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## # time_hour <dttm>
# Working copies of the airlines and flights tables
airlinesDF <- airlines
flightsDF <- flights
# Number of flight records (first element of the table's dimensions)
dim(flightsDF)[1L]
## [1] 336776
# General histogram of the arrival delays over all flights
hist(
  x = flightsDF$arr_delay,
  main = "Arrival Delays",
  xlab = "Delay time in Mins",
  ylab = "Number of Occurences"
)

# Keep only the flights that depart from JFK
flightsDFJFK <- flightsDF %>%
  filter(origin == "JFK")
# Count flights per destination and keep the ten busiest destinations
flightsDest <- flightsDFJFK %>%
  group_by(dest) %>%
  summarize(number_flights = n()) %>%
  arrange(desc(number_flights)) %>%
  head(n = 10)
# Plot
# Bar chart of the number of flights to each destination in flightsDest.
# Axis titles are set through labs(x =, y =): xlab/ylab arguments handed to
# ggplot() itself are not applied, so the intended titles never appeared.
theme_set(theme_classic())
FlightsDestBar <- ggplot(flightsDest, aes(dest, number_flights))
FlightsDestBar + geom_bar(stat = "identity", width = 0.5, fill = "red") +
  labs(title = "JFK Traffic Chart",
       subtitle = "# of Flights to different Destinations",
       x = "Destinations",
       y = "Number of Flights") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

Dataset 3 - CDC Data
# Read the CDC dataset from OpenIntro and preview the first six rows
CDCData <- read.csv(file = "http://www.openintro.org/stat/data/cdc.csv")
head(CDCData, n = 6L)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f
# Plot the health condition of people aged 30 or younger
# (the age filter is inclusive: age <= 30).
age30AndLess <- filter(CDCData, age <= 30)
# Summarise the filtered rows. Grouping the full CDCData here would chart
# every age group under the "less than 30" title.
dataByHealthCondition <- group_by(age30AndLess, genhlth)
dataByHealthCondition <- summarize(dataByHealthCondition, count = n())
# Labels are matched to the counts by position: this assumes the genhlth
# groups come back in alphabetical order and that all five are present.
lbls <- c("Excellent", "Fair", "Good", "Poor", "Very Good")
pie3D(dataByHealthCondition$count, labels=lbls, explode=0.1, main="Pie Chart of Health Condition of People less than 30 years old")

# Health condition of people older than 30 and up to 45 (45 included)
dataByHealthCondition <- CDCData %>%
  filter(age > 30, age <= 45) %>%
  group_by(genhlth) %>%
  summarize(count = n())
# Slice labels, matched to the counts by position
lbls <- c("Excellent", "Fair", "Good", "Poor", "Very Good")
pie3D(
  dataByHealthCondition$count,
  labels = lbls,
  explode = 0.1,
  main = "Pie Chart of Health Condition of People > 30 and < 45 years old"
)

# Health condition of people older than 45
dataByHealthCondition <- CDCData %>%
  filter(age > 45) %>%
  group_by(genhlth) %>%
  summarize(count = n())
# Slice labels, matched to the counts by position
lbls <- c("Excellent", "Fair", "Good", "Poor", "Very Good")
pie3D(
  dataByHealthCondition$count,
  labels = lbls,
  explode = 0.1,
  main = "Pie Chart of Health Condition of People > 45 years old"
)

# What proportion of the sample reports being in excellent health?
# Count the rows whose genhlth is "excellent" and divide by the sample size.
# The earlier nrow(count(...)) form measured the one-row table returned by
# count(), so the numerator was always 1.
print(paste("Percent of people being in excellent health is ", round(sum(CDCData$genhlth == "excellent", na.rm = TRUE) / nrow(CDCData) * 100, 2), "%"))
## NOTE: the previously rendered output ("... is 0 %") was produced by the row-count bug; re-knit to refresh.
# What is the percentage of Male vs Female smokers
# smoke100 == 1 restricts each group to smokers. Filtering on gender alone
# reported the gender split of the whole sample instead.
maleSmokers <- filter(CDCData, gender == "m", smoke100 == 1)
femaleSmokers <- filter(CDCData, gender == "f", smoke100 == 1)
# Both shares are relative to the entire sample, as the messages state.
print(paste("Percentage of Male smokers compare to entire population is:", round(nrow(maleSmokers)/nrow(CDCData)*100, 2), "%"))
print(paste("Percentage of Female smokers compare to entire population is:", round(nrow(femaleSmokers)/nrow(CDCData)*100, 2), "%"))
## NOTE: the figures rendered before this fix (47.84 % and 52.15 %) were the gender split of the sample; re-knit to refresh.
# Respondents with smoke100 == 1 who still report excellent health
healthySmokers <- filter(CDCData, smoke100 == 1, genhlth == "excellent")
healthySmokerCount <- nrow(healthySmokers)
print(paste("Number of people smoking 100 cigarette and still staying healthy is", healthySmokerCount))
## [1] "Number of people smoking 100 cigarette and still staying healthy is 1778"
# Same count expressed as a share of the whole sample
print(paste("Percentage to general population is: ", round(healthySmokerCount / nrow(CDCData) * 100, 2), "%"))
## [1] "Percentage to general population is: 8.89 %"