For this project I am using the data feeds from the following two sites
1. http://ourairports.com/data.
2. https://www2.census.gov/programs-surveys/popest/datasets.
Using the R package dplyr hope to do staging of the data.
1.http://ourairports.com/data. - This site has most updated data about airports in the world. I do filter only the airports in USA
2.https://www2.census.gov/programs-surveys/popest/datasets. - USA Census web site
#load ourairports data from .scv files
#ourairports
#Loading airports.csv
airports_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/airports.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data/airports.csv", header=TRUE)
#Loading countries.csv
countries_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/countries.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data//countries.csv", header=TRUE)
#Loading regions.csv
regions_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/regions.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data//regions.csv", header=TRUE)
#Loading runwayss.csv
runways_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/runways.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data/runways.csv", header=TRUE)
#Loading US Populations data
Popult_df <- read.csv("https://www2.census.gov/programs-surveys/popest/datasets/2010-2016/national/totals/nst-est2016-popchg2010_2016.csv", header=TRUE)
head(airports_df)
## id ident type name
## 1 6523 00A heliport Total Rf Heliport
## 2 6524 00AK small_airport Lowell Field
## 3 6525 00AL small_airport Epps Airpark
## 4 6526 00AR closed Newport Hospital & Clinic Heliport
## 5 322127 00AS small_airport Fulton Airport
## 6 6527 00AZ small_airport Cordes Airport
## latitude_deg longitude_deg elevation_ft continent iso_country iso_region
## 1 40.0708 -74.93360 11 <NA> US US-PA
## 2 59.9492 -151.69600 450 <NA> US US-AK
## 3 34.8648 -86.77030 820 <NA> US US-AL
## 4 35.6087 -91.25490 237 <NA> US US-AR
## 5 34.9428 -97.81802 1100 <NA> US US-OK
## 6 34.3056 -112.16500 3810 <NA> US US-AZ
## municipality scheduled_service gps_code iata_code local_code home_link
## 1 Bensalem no 00A 00A
## 2 Anchor Point no 00AK 00AK
## 3 Harvest no 00AL 00AL
## 4 Newport no
## 5 Alex no 00AS 00AS
## 6 Cordes no 00AZ 00AZ
## wikipedia_link keywords
## 1
## 2
## 3
## 4 00AR
## 5
## 6
#head(countries_df)
head(regions_df)
## id code local_code name continent iso_country
## 1 302811 AD-02 02 Canillo EU AD
## 2 302812 AD-03 03 Encamp EU AD
## 3 302813 AD-04 04 La Massana EU AD
## 4 302814 AD-05 05 Ordino EU AD
## 5 302815 AD-06 06 Sant Juliàde Lòria EU AD
## 6 302816 AD-07 07 Andorra la Vella EU AD
## wikipedia_link keywords
## 1 http://en.wikipedia.org/wiki/Canillo
## 2 http://en.wikipedia.org/wiki/Encamp
## 3 http://en.wikipedia.org/wiki/La_Massana
## 4 http://en.wikipedia.org/wiki/Ordino
## 5 http://en.wikipedia.org/wiki/Sant_Julià_de_Lòria
## 6 http://en.wikipedia.org/wiki/Andorra_la_Vella
#head(runways_df)
head(Popult_df)
## SUMLEV REGION DIVISION STATE NAME ESTIMATESBASE2010
## 1 10 0 0 0 United States 308758105
## 2 20 1 0 0 Northeast Region 55318353
## 3 20 2 0 0 Midwest Region 66929825
## 4 20 3 0 0 South Region 114563005
## 5 20 4 0 0 West Region 71946922
## 6 40 3 6 1 Alabama 4780131
## POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013
## 1 309348193 311663358 313998379 316204908
## 2 55388056 55632766 55829059 55988771
## 3 66978602 67153331 67332320 67543948
## 4 114863114 116061801 117299171 118424320
## 5 72118421 72815460 73537829 74247869
## 6 4785492 4799918 4815960 4829479
## POPESTIMATE2014 POPESTIMATE2015 POPESTIMATE2016 NPOPCHG_2010
## 1 318563456 320896618 323127513 590088
## 2 56116791 56184737 56209510 69703
## 3 67726368 67838387 67941429 48777
## 4 119696311 121039206 122319574 300109
## 5 75023986 75834288 76657000 171499
## 6 4843214 4853875 4863300 5361
## NPOPCHG_2011 NPOPCHG_2012 NPOPCHG_2013 NPOPCHG_2014 NPOPCHG_2015
## 1 2315165 2335021 2206529 2358548 2333162
## 2 244710 196293 159712 128020 67946
## 3 174729 178989 211628 182420 112019
## 4 1198687 1237370 1125149 1271991 1342895
## 5 697039 722369 710040 776117 810302
## 6 14426 16042 13519 13735 10661
## NPOPCHG_2016 PPOPCHG_2010 PPOPCHG_2011 PPOPCHG_2012 PPOPCHG_2013
## 1 2230895 0.19111660 0.7484010 0.7492126 0.7027199
## 2 24773 0.12600339 0.4418101 0.3528370 0.2860732
## 3 103042 0.07287782 0.2608729 0.2665378 0.3143037
## 4 1280368 0.26195978 1.0435787 1.0661303 0.9592131
## 5 822712 0.23836878 0.9665201 0.9920544 0.9655439
## 6 9425 0.11215174 0.3014528 0.3342140 0.2807125
## PPOPCHG_2014 PPOPCHG_2015 PPOPCHG_2016 NRANK_ESTBASE2010
## 1 0.7458923 0.7324010 0.69520677 X
## 2 0.2286530 0.1210796 0.04409205 4
## 3 0.2700760 0.1653994 0.15189335 3
## 4 1.0740961 1.1219185 1.05781262 1
## 5 1.0453054 1.0800573 1.08488129 2
## 6 0.2843992 0.2201224 0.19417476 23
## NRANK_POPEST2010 NRANK_POPEST2011 NRANK_POPEST2012 NRANK_POPEST2013
## 1 X X X X
## 2 4 4 4 4
## 3 3 3 3 3
## 4 1 1 1 1
## 5 2 2 2 2
## 6 23 23 23 23
## NRANK_POPEST2014 NRANK_POPEST2015 NRANK_POPEST2016 NRANK_NPCHG2010
## 1 X X X X
## 2 4 4 4 3
## 3 3 3 3 4
## 4 1 1 1 1
## 5 2 2 2 2
## 6 23 24 24 30
## NRANK_NPCHG2011 NRANK_NPCHG2012 NRANK_NPCHG2013 NRANK_NPCHG2014
## 1 X X X X
## 2 3 3 4 4
## 3 4 4 3 3
## 4 1 1 1 1
## 5 2 2 2 2
## 6 31 26 32 30
## NRANK_NPCHG2015 NRANK_NPCHG2016 NRANK_PPCHG2010 NRANK_PPCHG2011
## 1 X X X X
## 2 4 4 3 3
## 3 3 3 4 4
## 4 1 1 1 1
## 5 2 2 2 2
## 6 31 31 38 39
## NRANK_PPCHG2012 NRANK_PPCHG2013 NRANK_PPCHG2014 NRANK_PPCHG2015
## 1 X X X X
## 2 3 4 4 4
## 3 4 3 3 3
## 4 1 2 1 1
## 5 2 1 2 2
## 6 36 37 34 35
## NRANK_PPCHG2016
## 1 X
## 2 4
## 3 3
## 4 2
## 5 1
## 6 34
#distinct_df = airports_df %>% distinct(type)
#distinct_df
usairports_df = airports_df %>% filter(iso_country == "US")
head(usairports_df)
## id ident type name
## 1 6523 00A heliport Total Rf Heliport
## 2 6524 00AK small_airport Lowell Field
## 3 6525 00AL small_airport Epps Airpark
## 4 6526 00AR closed Newport Hospital & Clinic Heliport
## 5 322127 00AS small_airport Fulton Airport
## 6 6527 00AZ small_airport Cordes Airport
## latitude_deg longitude_deg elevation_ft continent iso_country iso_region
## 1 40.0708 -74.93360 11 <NA> US US-PA
## 2 59.9492 -151.69600 450 <NA> US US-AK
## 3 34.8648 -86.77030 820 <NA> US US-AL
## 4 35.6087 -91.25490 237 <NA> US US-AR
## 5 34.9428 -97.81802 1100 <NA> US US-OK
## 6 34.3056 -112.16500 3810 <NA> US US-AZ
## municipality scheduled_service gps_code iata_code local_code home_link
## 1 Bensalem no 00A 00A
## 2 Anchor Point no 00AK 00AK
## 3 Harvest no 00AL 00AL
## 4 Newport no
## 5 Alex no 00AS 00AS
## 6 Cordes no 00AZ 00AZ
## wikipedia_link keywords
## 1
## 2
## 3
## 4 00AR
## 5
## 6
usaregions_df = regions_df%>% filter(iso_country == "US")
head(usaregions_df)
## id code local_code name continent iso_country
## 1 306076 US-AK AK Alaska <NA> US
## 2 306077 US-AL AL Alabama <NA> US
## 3 306078 US-AR AR Arkansas <NA> US
## 4 306079 US-AZ AZ Arizona <NA> US
## 5 306080 US-CA CA California <NA> US
## 6 306081 US-CO CO Colorado <NA> US
## wikipedia_link keywords
## 1 http://en.wikipedia.org/wiki/Alaska
## 2 http://en.wikipedia.org/wiki/Alabama
## 3 http://en.wikipedia.org/wiki/Arkansas
## 4 http://en.wikipedia.org/wiki/Arizona
## 5 http://en.wikipedia.org/wiki/California
## 6 http://en.wikipedia.org/wiki/Colorado
usaPopult_df = Popult_df%>% filter(STATE != 0)
head(usaPopult_df)
## SUMLEV REGION DIVISION STATE NAME ESTIMATESBASE2010
## 1 40 3 6 1 Alabama 4780131
## 2 40 4 9 2 Alaska 710249
## 3 40 4 8 4 Arizona 6392301
## 4 40 3 7 5 Arkansas 2916025
## 5 40 4 9 6 California 37254522
## 6 40 4 8 8 Colorado 5029324
## POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013
## 1 4785492 4799918 4815960 4829479
## 2 714031 722713 731089 736879
## 3 6408312 6467163 6549634 6624617
## 4 2921995 2939493 2950685 2958663
## 5 37332685 37676861 38011074 38335203
## 6 5048644 5118360 5189867 5267603
## POPESTIMATE2014 POPESTIMATE2015 POPESTIMATE2016 NPOPCHG_2010
## 1 4843214 4853875 4863300 5361
## 2 736705 737709 741894 3782
## 3 6719993 6817565 6931071 16011
## 4 2966912 2977853 2988248 5970
## 5 38680810 38993940 39250017 78163
## 6 5349648 5448819 5540545 19320
## NPOPCHG_2011 NPOPCHG_2012 NPOPCHG_2013 NPOPCHG_2014 NPOPCHG_2015
## 1 14426 16042 13519 13735 10661
## 2 8682 8376 5790 -174 1004
## 3 58851 82471 74983 95376 97572
## 4 17498 11192 7978 8249 10941
## 5 344176 334213 324129 345607 313130
## 6 69716 71507 77736 82045 99171
## NPOPCHG_2016 PPOPCHG_2010 PPOPCHG_2011 PPOPCHG_2012 PPOPCHG_2013
## 1 9425 0.1121517 0.3014528 0.3342140 0.2807125
## 2 4185 0.5324893 1.2159136 1.1589663 0.7919692
## 3 113506 0.2504732 0.9183542 1.2752269 1.1448426
## 4 10395 0.2047308 0.5988374 0.3807459 0.2703779
## 5 256077 0.2098081 0.9219160 0.8870511 0.8527226
## 6 91726 0.3841471 1.3808856 1.3970686 1.4978419
## PPOPCHG_2014 PPOPCHG_2015 PPOPCHG_2016 NRANK_ESTBASE2010
## 1 0.2843992 0.2201224 0.1941748 23
## 2 -0.0236131 0.1362825 0.5672969 47
## 3 1.4397210 1.4519658 1.6649053 16
## 4 0.2788084 0.3687673 0.3490770 32
## 5 0.9015395 0.8095229 0.6567097 1
## 6 1.5575395 1.8537855 1.6834107 22
## NRANK_POPEST2010 NRANK_POPEST2011 NRANK_POPEST2012 NRANK_POPEST2013
## 1 23 23 23 23
## 2 47 47 47 47
## 3 16 16 15 15
## 4 32 32 32 32
## 5 1 1 1 1
## 6 22 22 22 22
## NRANK_POPEST2014 NRANK_POPEST2015 NRANK_POPEST2016 NRANK_NPCHG2010
## 1 23 24 24 30
## 2 48 48 48 33
## 3 15 14 14 11
## 4 32 33 33 26
## 5 1 1 1 2
## 6 22 22 21 8
## NRANK_NPCHG2011 NRANK_NPCHG2012 NRANK_NPCHG2013 NRANK_NPCHG2014
## 1 31 26 32 30
## 2 39 41 43 47
## 3 10 7 6 5
## 4 26 34 40 38
## 5 2 2 2 2
## 6 9 10 5 8
## NRANK_NPCHG2015 NRANK_NPCHG2016 NRANK_PPCHG2010 NRANK_PPCHG2011
## 1 31 31 38 39
## 2 44 37 2 7
## 3 8 5 16 16
## 4 29 30 25 30
## 5 3 3 23 15
## 6 7 8 5 5
## NRANK_PPCHG2012 NRANK_PPCHG2013 NRANK_PPCHG2014 NRANK_PPCHG2015
## 1 36 37 34 35
## 2 11 21 46 38
## 3 8 9 7 9
## 4 32 38 35 30
## 5 19 20 17 17
## 6 6 5 5 2
## NRANK_PPCHG2016
## 1 34
## 2 21
## 3 8
## 4 28
## 5 20
## 6 7
df1<-inner_join(usairports_df,usaregions_df, by = c("iso_region"="code"))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
allairports_df <- arrange(select(df1,Region=iso_region,Municipality = municipality,State=name.y,AirPortName=name.x,AirPortType=type),(State))
head(allairports_df)
## Region Municipality State AirPortName AirPortType
## 1 US-U-A (unassigned) atl large_airport
## 2 US-U-A Austin (unassigned) Erase Me 16 closed
## 3 US-U-A Greenfield (unassigned) Ronnie Cole small_airport
## 4 US-U-A (unassigned) Erase Me 14 closed
## 5 US-U-A Baker (unassigned) Beacon Station Air Strip closed
## 6 US-AL Harvest Alabama Epps Airpark small_airport
allairports_types_bystat_df <- allairports_df %>%
group_by(State,AirPortType) %>%
summarise(airportcount = n())%>%
arrange%>%
filter(State != '(unassigned)')
head(allairports_types_bystat_df)
## Source: local data frame [6 x 3]
## Groups: State [1]
##
## State AirPortType airportcount
## <fctr> <fctr> <int>
## 1 Alabama closed 5
## 2 Alabama heliport 103
## 3 Alabama large_airport 4
## 4 Alabama medium_airport 11
## 5 Alabama seaplane_base 7
## 6 Alabama small_airport 183
allairports_bystat_df <- allairports_df %>%
group_by(State) %>%
summarise(airportcount = n())%>%
arrange %>%
filter(State != '(unassigned)')
head(allairports_bystat_df)
## # A tibble: 6 × 2
## State airportcount
## <fctr> <int>
## 1 Alabama 313
## 2 Alaska 796
## 3 Arizona 349
## 4 Arkansas 385
## 5 California 1066
## 6 Colorado 500
allpopu_bystat_df <- arrange(select(usaPopult_df,Region=NAME,Population = POPESTIMATE2016) ,(Region))
head(allpopu_bystat_df)
## Region Population
## 1 Alabama 4863300
## 2 Alaska 741894
## 3 Arizona 6931071
## 4 Arkansas 2988248
## 5 California 39250017
## 6 Colorado 5540545
allairports_allpopu_bystat_df<-left_join(allairports_bystat_df,allpopu_bystat_df, by = c("State"="Region"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
allairports_allpopu_bystat_df <-allairports_allpopu_bystat_df%>%
select(State,airportcount,Population) %>%
mutate(Popu_10k = Population / 10000)
df3 <- left_join(allairports_types_bystat_df,allpopu_bystat_df, by = c("State"="Region"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
# allairports_types_allpopu_bystat_df <- select(df3,State,AirPortType,airportcount,Population,avgpop_airport = (Population/ airportcount))
# allairports_types_allpopu_bystat_df
allairports_types_allpopu_bystat_df<- df3 %>%
select(State,AirPortType,airportcount,Population) %>%
mutate(avgpop_airport = Population / airportcount)
summary(allairports_types_allpopu_bystat_df )
## State AirPortType airportcount
## Length:297 balloonport :11 Min. : 1.00
## Class :character closed :50 1st Qu.: 4.00
## Mode :character heliport :51 Median : 12.00
## large_airport :46 Mean : 73.82
## medium_airport:50 3rd Qu.: 73.00
## seaplane_base :39 Max. :1516.00
## small_airport :50
## Population avgpop_airport
## Min. : 585501 Min. : 1520
## 1st Qu.: 1831102 1st Qu.: 50530
## Median : 4863300 Median : 403361
## Mean : 6585385 Mean : 1031866
## 3rd Qu.: 8411808 3rd Qu.: 1056426
## Max. :39250017 Max. :19745289
##
summary(allairports_allpopu_bystat_df)
## State airportcount Population Popu_10k
## Length:51 Min. : 21.0 Min. : 585501 Min. : 58.55
## Class :character 1st Qu.: 194.5 1st Qu.: 1757121 1st Qu.: 175.71
## Mode :character Median : 348.0 Median : 4436974 Median : 443.70
## Mean : 429.9 Mean : 6335834 Mean : 633.58
## 3rd Qu.: 553.0 3rd Qu.: 7109536 3rd Qu.: 710.95
## Max. :2184.0 Max. :39250017 Max. :3925.00
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
allairports_allpopu_bystat_df.lm1 <- lm(airportcount ~ Popu_10k, data = allairports_allpopu_bystat_df)
plot(allairports_allpopu_bystat_df$airportcount ~ allairports_allpopu_bystat_df$Popu_10k, main = "Relationship between State Population vs Airports",xlab='Population in States(10Ks)',ylab='Airport Count')
abline(allairports_allpopu_bystat_df.lm1 )
abline(h=429,col = "red")
abline(v=633,col = "blue" )
The above graph suggests that Airport Count increases linearly with Population in States(10Ks) so I will fit a simple linear regression model to the data model with the mean of Airport Count and Population in States(10Ks).
summary(allairports_allpopu_bystat_df.lm1)
##
## Call:
## lm(formula = airportcount ~ Popu_10k, data = allairports_allpopu_bystat_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -610.92 -122.78 -45.81 115.49 938.51
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 189.87966 42.11856 4.508 4.08e-05 ***
## Popu_10k 0.37886 0.04401 8.608 2.28e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 225.4 on 49 degrees of freedom
## Multiple R-squared: 0.602, Adjusted R-squared: 0.5938
## F-statistic: 74.11 on 1 and 49 DF, p-value: 2.277e-11
We get a lot of useful information here without being too overwhelmed by pages of output.
The estimates for the model intercept is 189.87966 and the coefficient measuring the slope of the relationship with Popu_10k is 0.37886 and information about standard errors of these estimates is also provided in the Coefficients table. We see that the test of significance of the model coefficients is also summarised in that table so we can see that there is evidence that the coefficient is significantly different to zero - as the population increases so does airports. It proves Positive Correlation between the number of airports and state population.
library(ggplot2)
ggplot(allairports_allpopu_bystat_df, aes(fill=airportcount, y=airportcount, x=State)) + ggtitle("Total Airport Count in each State ") + geom_bar( stat="identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + geom_point()
ggplot(allairports_types_allpopu_bystat_df, aes(fill=AirPortType, y=airportcount, x=State)) + ggtitle("Airport Type Count in each State ") + geom_bar( stat="identity") + theme(axis.text.x = element_text(angle = 70, hjust = 1))
datn1 <- allpopu_bystat_df %>% mutate(Popu_10k = Population / 10000) %>% arrange(Popu_10k)
ggplot(data=datn1, aes(y=Popu_10k,x=Region, colour=Region)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position="none") +
geom_point() +
geom_bar(stat="identity") +
ggtitle("Population in each State (10k )")