Research question

What are the cases, and how many are there?

Describe the method of data collection:

What type of study is this:

Data Source:

1.http://ourairports.com/data. - This site has most updated data about airports in the world. I do filter only the airports in USA

2.https://www2.census.gov/programs-surveys/popest/datasets. - USA Census web site

Response:

Explanatory:

Relevant summary statistics

#load ourairports data from .scv files
#ourairports

#Loading airports.csv
airports_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/airports.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data/airports.csv", header=TRUE)

#Loading countries.csv
countries_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/countries.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data//countries.csv", header=TRUE)
                          

#Loading regions.csv
regions_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/regions.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data//regions.csv", header=TRUE)

#Loading runwayss.csv
runways_df <- read.csv("https://raw.githubusercontent.com/mathsanu/CUNY_MSDA/master/DATA606/FinalDataProject/Data/runways.csv", header=TRUE)
#airports_df <- read.csv("http://ourairports.com/data/runways.csv", header=TRUE)

#Loading US Populations data
Popult_df <- read.csv("https://www2.census.gov/programs-surveys/popest/datasets/2010-2016/national/totals/nst-est2016-popchg2010_2016.csv", header=TRUE)

head(airports_df)
##       id ident          type                               name
## 1   6523   00A      heliport                  Total Rf Heliport
## 2   6524  00AK small_airport                       Lowell Field
## 3   6525  00AL small_airport                       Epps Airpark
## 4   6526  00AR        closed Newport Hospital & Clinic Heliport
## 5 322127  00AS small_airport                     Fulton Airport
## 6   6527  00AZ small_airport                     Cordes Airport
##   latitude_deg longitude_deg elevation_ft continent iso_country iso_region
## 1      40.0708     -74.93360           11      <NA>          US      US-PA
## 2      59.9492    -151.69600          450      <NA>          US      US-AK
## 3      34.8648     -86.77030          820      <NA>          US      US-AL
## 4      35.6087     -91.25490          237      <NA>          US      US-AR
## 5      34.9428     -97.81802         1100      <NA>          US      US-OK
## 6      34.3056    -112.16500         3810      <NA>          US      US-AZ
##   municipality scheduled_service gps_code iata_code local_code home_link
## 1     Bensalem                no      00A                  00A          
## 2 Anchor Point                no     00AK                 00AK          
## 3      Harvest                no     00AL                 00AL          
## 4      Newport                no                                        
## 5         Alex                no     00AS                 00AS          
## 6       Cordes                no     00AZ                 00AZ          
##   wikipedia_link keywords
## 1                        
## 2                        
## 3                        
## 4                    00AR
## 5                        
## 6
#head(countries_df)
head(regions_df)
##       id  code local_code                  name continent iso_country
## 1 302811 AD-02         02               Canillo        EU          AD
## 2 302812 AD-03         03                Encamp        EU          AD
## 3 302813 AD-04         04            La Massana        EU          AD
## 4 302814 AD-05         05                Ordino        EU          AD
## 5 302815 AD-06         06 Sant Julià de Lòria        EU          AD
## 6 302816 AD-07         07      Andorra la Vella        EU          AD
##                                       wikipedia_link keywords
## 1               http://en.wikipedia.org/wiki/Canillo         
## 2                http://en.wikipedia.org/wiki/Encamp         
## 3            http://en.wikipedia.org/wiki/La_Massana         
## 4                http://en.wikipedia.org/wiki/Ordino         
## 5 http://en.wikipedia.org/wiki/Sant_Julià_de_Lòria         
## 6      http://en.wikipedia.org/wiki/Andorra_la_Vella
#head(runways_df)
head(Popult_df)
##   SUMLEV REGION DIVISION STATE             NAME ESTIMATESBASE2010
## 1     10      0        0     0    United States         308758105
## 2     20      1        0     0 Northeast Region          55318353
## 3     20      2        0     0   Midwest Region          66929825
## 4     20      3        0     0     South Region         114563005
## 5     20      4        0     0      West Region          71946922
## 6     40      3        6     1          Alabama           4780131
##   POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013
## 1       309348193       311663358       313998379       316204908
## 2        55388056        55632766        55829059        55988771
## 3        66978602        67153331        67332320        67543948
## 4       114863114       116061801       117299171       118424320
## 5        72118421        72815460        73537829        74247869
## 6         4785492         4799918         4815960         4829479
##   POPESTIMATE2014 POPESTIMATE2015 POPESTIMATE2016 NPOPCHG_2010
## 1       318563456       320896618       323127513       590088
## 2        56116791        56184737        56209510        69703
## 3        67726368        67838387        67941429        48777
## 4       119696311       121039206       122319574       300109
## 5        75023986        75834288        76657000       171499
## 6         4843214         4853875         4863300         5361
##   NPOPCHG_2011 NPOPCHG_2012 NPOPCHG_2013 NPOPCHG_2014 NPOPCHG_2015
## 1      2315165      2335021      2206529      2358548      2333162
## 2       244710       196293       159712       128020        67946
## 3       174729       178989       211628       182420       112019
## 4      1198687      1237370      1125149      1271991      1342895
## 5       697039       722369       710040       776117       810302
## 6        14426        16042        13519        13735        10661
##   NPOPCHG_2016 PPOPCHG_2010 PPOPCHG_2011 PPOPCHG_2012 PPOPCHG_2013
## 1      2230895   0.19111660    0.7484010    0.7492126    0.7027199
## 2        24773   0.12600339    0.4418101    0.3528370    0.2860732
## 3       103042   0.07287782    0.2608729    0.2665378    0.3143037
## 4      1280368   0.26195978    1.0435787    1.0661303    0.9592131
## 5       822712   0.23836878    0.9665201    0.9920544    0.9655439
## 6         9425   0.11215174    0.3014528    0.3342140    0.2807125
##   PPOPCHG_2014 PPOPCHG_2015 PPOPCHG_2016 NRANK_ESTBASE2010
## 1    0.7458923    0.7324010   0.69520677                 X
## 2    0.2286530    0.1210796   0.04409205                 4
## 3    0.2700760    0.1653994   0.15189335                 3
## 4    1.0740961    1.1219185   1.05781262                 1
## 5    1.0453054    1.0800573   1.08488129                 2
## 6    0.2843992    0.2201224   0.19417476                23
##   NRANK_POPEST2010 NRANK_POPEST2011 NRANK_POPEST2012 NRANK_POPEST2013
## 1                X                X                X                X
## 2                4                4                4                4
## 3                3                3                3                3
## 4                1                1                1                1
## 5                2                2                2                2
## 6               23               23               23               23
##   NRANK_POPEST2014 NRANK_POPEST2015 NRANK_POPEST2016 NRANK_NPCHG2010
## 1                X                X                X               X
## 2                4                4                4               3
## 3                3                3                3               4
## 4                1                1                1               1
## 5                2                2                2               2
## 6               23               24               24              30
##   NRANK_NPCHG2011 NRANK_NPCHG2012 NRANK_NPCHG2013 NRANK_NPCHG2014
## 1               X               X               X               X
## 2               3               3               4               4
## 3               4               4               3               3
## 4               1               1               1               1
## 5               2               2               2               2
## 6              31              26              32              30
##   NRANK_NPCHG2015 NRANK_NPCHG2016 NRANK_PPCHG2010 NRANK_PPCHG2011
## 1               X               X               X               X
## 2               4               4               3               3
## 3               3               3               4               4
## 4               1               1               1               1
## 5               2               2               2               2
## 6              31              31              38              39
##   NRANK_PPCHG2012 NRANK_PPCHG2013 NRANK_PPCHG2014 NRANK_PPCHG2015
## 1               X               X               X               X
## 2               3               4               4               4
## 3               4               3               3               3
## 4               1               2               1               1
## 5               2               1               2               2
## 6              36              37              34              35
##   NRANK_PPCHG2016
## 1               X
## 2               4
## 3               3
## 4               2
## 5               1
## 6              34

Staging and Filtering data frames

#distinct_df = airports_df %>% distinct(type)
#distinct_df
usairports_df = airports_df %>% filter(iso_country == "US")
head(usairports_df)
##       id ident          type                               name
## 1   6523   00A      heliport                  Total Rf Heliport
## 2   6524  00AK small_airport                       Lowell Field
## 3   6525  00AL small_airport                       Epps Airpark
## 4   6526  00AR        closed Newport Hospital & Clinic Heliport
## 5 322127  00AS small_airport                     Fulton Airport
## 6   6527  00AZ small_airport                     Cordes Airport
##   latitude_deg longitude_deg elevation_ft continent iso_country iso_region
## 1      40.0708     -74.93360           11      <NA>          US      US-PA
## 2      59.9492    -151.69600          450      <NA>          US      US-AK
## 3      34.8648     -86.77030          820      <NA>          US      US-AL
## 4      35.6087     -91.25490          237      <NA>          US      US-AR
## 5      34.9428     -97.81802         1100      <NA>          US      US-OK
## 6      34.3056    -112.16500         3810      <NA>          US      US-AZ
##   municipality scheduled_service gps_code iata_code local_code home_link
## 1     Bensalem                no      00A                  00A          
## 2 Anchor Point                no     00AK                 00AK          
## 3      Harvest                no     00AL                 00AL          
## 4      Newport                no                                        
## 5         Alex                no     00AS                 00AS          
## 6       Cordes                no     00AZ                 00AZ          
##   wikipedia_link keywords
## 1                        
## 2                        
## 3                        
## 4                    00AR
## 5                        
## 6
usaregions_df = regions_df%>% filter(iso_country == "US")
head(usaregions_df)
##       id  code local_code       name continent iso_country
## 1 306076 US-AK         AK     Alaska      <NA>          US
## 2 306077 US-AL         AL    Alabama      <NA>          US
## 3 306078 US-AR         AR   Arkansas      <NA>          US
## 4 306079 US-AZ         AZ    Arizona      <NA>          US
## 5 306080 US-CA         CA California      <NA>          US
## 6 306081 US-CO         CO   Colorado      <NA>          US
##                            wikipedia_link keywords
## 1     http://en.wikipedia.org/wiki/Alaska         
## 2    http://en.wikipedia.org/wiki/Alabama         
## 3   http://en.wikipedia.org/wiki/Arkansas         
## 4    http://en.wikipedia.org/wiki/Arizona         
## 5 http://en.wikipedia.org/wiki/California         
## 6   http://en.wikipedia.org/wiki/Colorado
usaPopult_df = Popult_df%>% filter(STATE != 0)
head(usaPopult_df)
##   SUMLEV REGION DIVISION STATE       NAME ESTIMATESBASE2010
## 1     40      3        6     1    Alabama           4780131
## 2     40      4        9     2     Alaska            710249
## 3     40      4        8     4    Arizona           6392301
## 4     40      3        7     5   Arkansas           2916025
## 5     40      4        9     6 California          37254522
## 6     40      4        8     8   Colorado           5029324
##   POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013
## 1         4785492         4799918         4815960         4829479
## 2          714031          722713          731089          736879
## 3         6408312         6467163         6549634         6624617
## 4         2921995         2939493         2950685         2958663
## 5        37332685        37676861        38011074        38335203
## 6         5048644         5118360         5189867         5267603
##   POPESTIMATE2014 POPESTIMATE2015 POPESTIMATE2016 NPOPCHG_2010
## 1         4843214         4853875         4863300         5361
## 2          736705          737709          741894         3782
## 3         6719993         6817565         6931071        16011
## 4         2966912         2977853         2988248         5970
## 5        38680810        38993940        39250017        78163
## 6         5349648         5448819         5540545        19320
##   NPOPCHG_2011 NPOPCHG_2012 NPOPCHG_2013 NPOPCHG_2014 NPOPCHG_2015
## 1        14426        16042        13519        13735        10661
## 2         8682         8376         5790         -174         1004
## 3        58851        82471        74983        95376        97572
## 4        17498        11192         7978         8249        10941
## 5       344176       334213       324129       345607       313130
## 6        69716        71507        77736        82045        99171
##   NPOPCHG_2016 PPOPCHG_2010 PPOPCHG_2011 PPOPCHG_2012 PPOPCHG_2013
## 1         9425    0.1121517    0.3014528    0.3342140    0.2807125
## 2         4185    0.5324893    1.2159136    1.1589663    0.7919692
## 3       113506    0.2504732    0.9183542    1.2752269    1.1448426
## 4        10395    0.2047308    0.5988374    0.3807459    0.2703779
## 5       256077    0.2098081    0.9219160    0.8870511    0.8527226
## 6        91726    0.3841471    1.3808856    1.3970686    1.4978419
##   PPOPCHG_2014 PPOPCHG_2015 PPOPCHG_2016 NRANK_ESTBASE2010
## 1    0.2843992    0.2201224    0.1941748                23
## 2   -0.0236131    0.1362825    0.5672969                47
## 3    1.4397210    1.4519658    1.6649053                16
## 4    0.2788084    0.3687673    0.3490770                32
## 5    0.9015395    0.8095229    0.6567097                 1
## 6    1.5575395    1.8537855    1.6834107                22
##   NRANK_POPEST2010 NRANK_POPEST2011 NRANK_POPEST2012 NRANK_POPEST2013
## 1               23               23               23               23
## 2               47               47               47               47
## 3               16               16               15               15
## 4               32               32               32               32
## 5                1                1                1                1
## 6               22               22               22               22
##   NRANK_POPEST2014 NRANK_POPEST2015 NRANK_POPEST2016 NRANK_NPCHG2010
## 1               23               24               24              30
## 2               48               48               48              33
## 3               15               14               14              11
## 4               32               33               33              26
## 5                1                1                1               2
## 6               22               22               21               8
##   NRANK_NPCHG2011 NRANK_NPCHG2012 NRANK_NPCHG2013 NRANK_NPCHG2014
## 1              31              26              32              30
## 2              39              41              43              47
## 3              10               7               6               5
## 4              26              34              40              38
## 5               2               2               2               2
## 6               9              10               5               8
##   NRANK_NPCHG2015 NRANK_NPCHG2016 NRANK_PPCHG2010 NRANK_PPCHG2011
## 1              31              31              38              39
## 2              44              37               2               7
## 3               8               5              16              16
## 4              29              30              25              30
## 5               3               3              23              15
## 6               7               8               5               5
##   NRANK_PPCHG2012 NRANK_PPCHG2013 NRANK_PPCHG2014 NRANK_PPCHG2015
## 1              36              37              34              35
## 2              11              21              46              38
## 3               8               9               7               9
## 4              32              38              35              30
## 5              19              20              17              17
## 6               6               5               5               2
##   NRANK_PPCHG2016
## 1              34
## 2              21
## 3               8
## 4              28
## 5              20
## 6               7

Joining two data frames Airports and Regions to get the State Names

df1<-inner_join(usairports_df,usaregions_df, by = c("iso_region"="code"))
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
allairports_df <- arrange(select(df1,Region=iso_region,Municipality = municipality,State=name.y,AirPortName=name.x,AirPortType=type),(State))
head(allairports_df)
##   Region Municipality        State              AirPortName   AirPortType
## 1 US-U-A              (unassigned)                      atl large_airport
## 2 US-U-A       Austin (unassigned)              Erase Me 16        closed
## 3 US-U-A   Greenfield (unassigned)              Ronnie Cole small_airport
## 4 US-U-A              (unassigned)              Erase Me 14        closed
## 5 US-U-A        Baker (unassigned) Beacon Station Air Strip        closed
## 6  US-AL      Harvest      Alabama             Epps Airpark small_airport

Calculating Airport Counts by State and Airport Types in to two different Data Frames

allairports_types_bystat_df <-  allairports_df %>%  
                                group_by(State,AirPortType) %>% 
                                summarise(airportcount = n())%>% 
                                arrange%>% 
                                filter(State != '(unassigned)')
head(allairports_types_bystat_df)
## Source: local data frame [6 x 3]
## Groups: State [1]
## 
##     State    AirPortType airportcount
##    <fctr>         <fctr>        <int>
## 1 Alabama         closed            5
## 2 Alabama       heliport          103
## 3 Alabama  large_airport            4
## 4 Alabama medium_airport           11
## 5 Alabama  seaplane_base            7
## 6 Alabama  small_airport          183
allairports_bystat_df <-  allairports_df %>% 
                          group_by(State) %>% 
                          summarise(airportcount = n())%>% 
                          arrange %>% 
                          filter(State != '(unassigned)')
head(allairports_bystat_df)
## # A tibble: 6 × 2
##        State airportcount
##       <fctr>        <int>
## 1    Alabama          313
## 2     Alaska          796
## 3    Arizona          349
## 4   Arkansas          385
## 5 California         1066
## 6   Colorado          500

Consider only the population in 2016

allpopu_bystat_df <- arrange(select(usaPopult_df,Region=NAME,Population = POPESTIMATE2016) ,(Region))
head(allpopu_bystat_df)
##       Region Population
## 1    Alabama    4863300
## 2     Alaska     741894
## 3    Arizona    6931071
## 4   Arkansas    2988248
## 5 California   39250017
## 6   Colorado    5540545

Join Airport Data Fram with the Population Data Frame

allairports_allpopu_bystat_df<-left_join(allairports_bystat_df,allpopu_bystat_df, by = c("State"="Region"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
allairports_allpopu_bystat_df <-allairports_allpopu_bystat_df%>% 
                               select(State,airportcount,Population) %>% 
                               mutate(Popu_10k = Population / 10000) 

Staging and Filtering data frames

 df3 <- left_join(allairports_types_bystat_df,allpopu_bystat_df, by = c("State"="Region"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
 # allairports_types_allpopu_bystat_df <- select(df3,State,AirPortType,airportcount,Population,avgpop_airport = (Population/ airportcount))  
 # allairports_types_allpopu_bystat_df  
allairports_types_allpopu_bystat_df<- df3 %>% 
                                      select(State,AirPortType,airportcount,Population) %>% 
                                      mutate(avgpop_airport = Population / airportcount) 
summary(allairports_types_allpopu_bystat_df )
##     State                   AirPortType  airportcount    
##  Length:297         balloonport   :11   Min.   :   1.00  
##  Class :character   closed        :50   1st Qu.:   4.00  
##  Mode  :character   heliport      :51   Median :  12.00  
##                     large_airport :46   Mean   :  73.82  
##                     medium_airport:50   3rd Qu.:  73.00  
##                     seaplane_base :39   Max.   :1516.00  
##                     small_airport :50                    
##    Population       avgpop_airport    
##  Min.   :  585501   Min.   :    1520  
##  1st Qu.: 1831102   1st Qu.:   50530  
##  Median : 4863300   Median :  403361  
##  Mean   : 6585385   Mean   : 1031866  
##  3rd Qu.: 8411808   3rd Qu.: 1056426  
##  Max.   :39250017   Max.   :19745289  
## 
summary(allairports_allpopu_bystat_df)
##     State            airportcount      Population          Popu_10k      
##  Length:51          Min.   :  21.0   Min.   :  585501   Min.   :  58.55  
##  Class :character   1st Qu.: 194.5   1st Qu.: 1757121   1st Qu.: 175.71  
##  Mode  :character   Median : 348.0   Median : 4436974   Median : 443.70  
##                     Mean   : 429.9   Mean   : 6335834   Mean   : 633.58  
##                     3rd Qu.: 553.0   3rd Qu.: 7109536   3rd Qu.: 710.95  
##                     Max.   :2184.0   Max.   :39250017   Max.   :3925.00
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
allairports_allpopu_bystat_df.lm1  <- lm(airportcount ~ Popu_10k, data = allairports_allpopu_bystat_df)
plot(allairports_allpopu_bystat_df$airportcount ~ allairports_allpopu_bystat_df$Popu_10k, main = "Relationship between State Population vs Airports",xlab='Population in States(10Ks)',ylab='Airport Count')
abline(allairports_allpopu_bystat_df.lm1 )
abline(h=429,col = "red") 
abline(v=633,col = "blue" )

Conclusion….

The above graph suggests that Airport Count increases linearly with Population in States(10Ks) so I will fit a simple linear regression model to the data model with the mean of Airport Count and Population in States(10Ks).

summary(allairports_allpopu_bystat_df.lm1)
## 
## Call:
## lm(formula = airportcount ~ Popu_10k, data = allairports_allpopu_bystat_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -610.92 -122.78  -45.81  115.49  938.51 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 189.87966   42.11856   4.508 4.08e-05 ***
## Popu_10k      0.37886    0.04401   8.608 2.28e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 225.4 on 49 degrees of freedom
## Multiple R-squared:  0.602,  Adjusted R-squared:  0.5938 
## F-statistic: 74.11 on 1 and 49 DF,  p-value: 2.277e-11
library(ggplot2)
ggplot(allairports_allpopu_bystat_df, aes(fill=airportcount, y=airportcount, x=State)) + ggtitle("Total Airport Count in each State ") +   geom_bar( stat="identity") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + geom_point() 

ggplot(allairports_types_allpopu_bystat_df, aes(fill=AirPortType, y=airportcount, x=State)) + ggtitle("Airport Type Count in each State ") + geom_bar( stat="identity") + theme(axis.text.x = element_text(angle = 70, hjust = 1))

datn1 <- allpopu_bystat_df %>% mutate(Popu_10k = Population / 10000) %>% arrange(Popu_10k)
ggplot(data=datn1, aes(y=Popu_10k,x=Region,  colour=Region)) + theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position="none") + 
 geom_point() +
   geom_bar(stat="identity") +
ggtitle("Population in each State (10k )")