library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
setwd("C:/Users/wjcor/Downloads/Intro2r")
bird=read.csv("birds.csv")
https://grist.org/living/what-types-of-planes-are-least-bad-for-the-planet/

This dataset displays the bird strikes that occurred by aircrift between 1990 and 1999 in the united states and parts of canada. It includes variables like “birds_struck” which is technically a numerical variable that shows how many birds were hit by that flight, “phase_of_flt” which is a categorical variable that describes how at what point in the flight the strikes happened, “ac_mass” which is a numerical variable that indicates in tons the total mass of each aircraft that struck birds, and things like “date,” “state,” and “height.” The data itself came from open intro where it is called “birds.” In order to clean the data, I had to separate the number ranges present in the “birds_struck” variable into their largest and smallest estimates and average them out, and then I had to remove the non-numerical data from some of the values like “over” and “-.” I chose this dataset because I have always been interested in the nature-focused side of data science.

bird$date <- str_sub(bird$date, end = -9)

This removes the “time” from the date variable, which will be important later

#Create list to add to
lowball=c()
#Loop through column, converting the lowest number of number ranges to a numerical value and changing over x to just x*
for(birdcount in bird$birds_struck){ 
  if(grepl("-", birdcount) == TRUE){
    birdcount <- as.numeric(gsub("-.*","",birdcount))
    lowball=c(lowball,birdcount)
  }else if(grepl("Over", birdcount)==TRUE){
    birdcount <- as.numeric(gsub("Over ","",birdcount))
    lowball=c(lowball,birdcount)
  }else {lowball=c(lowball, birdcount)}
}
#Create new variable
bird$lowball=as.numeric(lowball)

As the “birds_struck” variable is a character variable with 4 different ways of describing how many birds were struck, we need to make this columns more usable by creating an average for the number ranges and removing unnecessary non-numerical parts of values. This chunk takes the lowest number in the range and uses that for its value

#Create list to add to
highball=c()
#Loop through column, converting the highest number of number ranges to a numerical value and changing over x to just x*
for(birdcount in bird$birds_struck){ 
  if(grepl("-", birdcount) == TRUE){
    birdcount <- as.numeric(gsub(".*-","",birdcount))
    highball=c(highball,birdcount)
  }else if(grepl("Over", birdcount)==TRUE){
    birdcount <- as.numeric(gsub("Over ","",birdcount))
    highball=c(highball,birdcount)
  }else {highball=c(highball, birdcount)}
}
#Create new variable
bird$highball=as.numeric(highball)

Similar to the prior chunk, however this one takes the highest number

#Create temporary short names for ease of use
h=bird$highball
l=bird$lowball
#Create list to add to
avgbird=c()
#Loop through column length and for each row of each column determining if there is any difference between the values or if they are NA values.
for (i in 1:length(h)){
  if(is.na(h[i])==TRUE  | is.na(l[i])==TRUE){
    avgbird=c(avgbird, 0)
    }else if((h[i]>l[i])){
    avgbird=c(avgbird, (h[i]-l[i])/2)
  }else{
    avgbird=c(avgbird, l[i])
  }
}
#Create new variable
bird$avgbird=avgbird

Finally, this chunk takes those two variables and averages them out. We will use this variable for our regression analysis.

bird$time_of_day = as.factor(bird$time_of_day)
bird$phase_of_flt = as.factor(bird$phase_of_flt)

As these variables are unusable character columns, we turn them into factors so they can contribute to our analysis

#Create list to add to
lowballseen=c()
#Loop through column, converting the lowest number of number ranges to a numerical value and changing over x to just x*
for(birdcount in bird$birds_seen){ 
  if(grepl("-", birdcount) == TRUE){
    birdcount <- as.numeric(gsub("-.*","",birdcount))
    lowballseen=c(lowballseen,birdcount)
  }else if(grepl("Over", birdcount)==TRUE){
    birdcount <- as.numeric(gsub("Over ","",birdcount))
    lowballseen=c(lowballseen,birdcount)
  }else {lowballseen=c(lowballseen, birdcount)}
}
## Warning: NAs introduced by coercion
#Create new variable
bird$lowballseen=as.numeric(lowballseen)

This is the same as the aforementioned lowball chunk, however this time it is being applied to the number of birds seen by the pilot

#Create list to add to
highballseen=c()
#Loop through column, converting the highest number of number ranges to a numerical value and changing over x to just x*
for(birdcount in bird$birds_seen){ 
  if(grepl("-", birdcount) == TRUE){
    birdcount <- as.numeric(gsub(".*-","",birdcount))
    highballseen=c(highballseen,birdcount)
  }else if(grepl("Over", birdcount)==TRUE){
    birdcount <- as.numeric(gsub("Over ","",birdcount))
    highballseen=c(highballseen,birdcount)
  }else {highballseen=c(highballseen, birdcount)}
}
#Create new variable
bird$highballseen=as.numeric(highballseen)

Again similar, however with highest number taken

#Assign variables to shorter names for ease of use
hs=bird$highballseen
ls=bird$lowballseen
#Create list to add to
avgbirdseen=c()
#Loop through the length of the column, analyzing each row for NAs and differences between the highball and the lowball 
for (i in 1:length(hs)){
  if(is.na(hs[i])==TRUE  | is.na(ls[i])==TRUE){
    avgbirdseen=c(avgbirdseen, 0)
    }else if((hs[i]>ls[i])){
    #Average out any differences
      avgbirdseen=c(avgbirdseen, (hs[i]-ls[i])/2)
  }else{
    avgbirdseen=c(avgbirdseen, ls[i])
  }
}
#Create column
bird$avgbirdseen=avgbirdseen

This again averages the highball and lowball variables

#Fix the date variable, turning it from a character string to a date.
bird$date=gsub(" ", "", bird$date, fixed = TRUE)
bird$date <- as.Date(bird$date, "%m/%d/%Y")

This is why we removed the time bit from the date variable

birdfit1=lm(data=bird, avgbird ~ height + ac_mass + time_of_day + phase_of_flt  + avgbirdseen +num_engs + date)
summary(birdfit1)
## 
## Call:
## lm(formula = avgbird ~ height + ac_mass + time_of_day + phase_of_flt + 
##     avgbirdseen + num_engs + date, data = bird)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.182 -1.017 -0.757 -0.231 98.963 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               7.487e-01  5.651e-01   1.325   0.1853    
## height                   -7.971e-05  3.156e-05  -2.526   0.0116 *  
## ac_mass                   9.683e-02  5.465e-02   1.772   0.0765 .  
## time_of_dayDay            2.236e-01  2.394e-01   0.934   0.3504    
## time_of_dayDusk           3.688e-02  3.061e-01   0.120   0.9041    
## time_of_dayNight         -9.046e-02  2.538e-01  -0.356   0.7215    
## phase_of_fltClimb         3.733e-01  1.297e-01   2.878   0.0040 ** 
## phase_of_fltDescent       1.985e-01  2.870e-01   0.692   0.4891    
## phase_of_fltEn Route      6.587e-02  3.241e-01   0.203   0.8390    
## phase_of_fltLanding Roll  1.815e-01  1.307e-01   1.388   0.1651    
## phase_of_fltParked       -6.891e-01  1.663e+00  -0.414   0.6786    
## phase_of_fltTake-off run  1.623e-01  1.278e-01   1.270   0.2042    
## phase_of_fltTaxi         -5.764e-01  7.348e-01  -0.784   0.4328    
## avgbirdseen               1.391e-01  4.274e-03  32.550   <2e-16 ***
## num_engs                  1.784e-01  9.908e-02   1.800   0.0718 .  
## date                      1.302e-05  5.176e-05   0.252   0.8014    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.504 on 15295 degrees of freedom
##   (3991 observations deleted due to missingness)
## Multiple R-squared:  0.07087,    Adjusted R-squared:  0.06996 
## F-statistic: 77.78 on 15 and 15295 DF,  p-value: < 2.2e-16

From the model we can see that the only significant variables are the aircraft’s height, it’s weight, the number of engines, whether it is climbing, and the average number of birds seen by the pilots. Further analysis also revealed whether it was day as a significant variable.

#Create list to add to
daytime=c()
#Loop through column and assign 1s to daytime flights and 0 to anything else
for (time in bird$time_of_day) {
  if(grepl("Da", time) == TRUE){
    daytime=c(daytime, 1)
  }else{
    daytime=c(daytime,0)
  }
}
#Create new column
bird$daytime=daytime

This chunk creates a new column of binary that represents whether it was daytime or not in that particular flight

#Create list to add to
climb=c()
#Loop through column and assign 1s to strikes occurring during climbing and 0 to anything else
for (phase in bird$phase_of_flt) {
  if(grepl("Climb", phase) == TRUE){
    climb=c(climb, 1)
  }else{
    climb=c(climb,0)
  }
}
#Create new column
bird$climb=climb

This allows us to take only the significant aspect of phase of flight when we want to call it

birdfit2=lm(data=bird, avgbird ~ height + ac_mass + daytime + climb  + avgbirdseen +num_engs)
summary(birdfit2)
## 
## Call:
## lm(formula = avgbird ~ height + ac_mass + daytime + climb + avgbirdseen + 
##     num_engs, data = bird)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.180 -0.954 -0.722 -0.268 98.843 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.955e-01  1.942e-01   4.612 4.03e-06 ***
## height      -7.976e-05  2.502e-05  -3.188  0.00143 ** 
## ac_mass      1.035e-01  5.215e-02   1.985  0.04712 *  
## daytime      2.956e-01  9.626e-02   3.070  0.00214 ** 
## climb        2.627e-01  1.129e-01   2.327  0.01998 *  
## avgbirdseen  1.406e-01  4.175e-03  33.680  < 2e-16 ***
## num_engs     1.587e-01  9.579e-02   1.657  0.09759 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.455 on 15870 degrees of freedom
##   (3425 observations deleted due to missingness)
## Multiple R-squared:  0.07178,    Adjusted R-squared:  0.07143 
## F-statistic: 204.5 on 6 and 15870 DF,  p-value: < 2.2e-16

The final model reveals that it is not a very effective one. Based on the Adjusted R^2, which describes how fitting the model is to the data, and the residuals, which show an immense amount of rightward skewing. According to the R squared, only 7.143% of the variation in number of birds being hit can be explained with this model. While the p-value is incredibly small, the model is still not an effective one as a result of the issues with the R^2 and residuals

model_residuals = birdfit2$residuals

ggplot(data=birdfit2, aes(x=model_residuals))+
         geom_histogram(binwidth = 5, fill="darkblue")+
  theme("bw")+
  labs(x="Residuals", y="Frequency")

This diagnostic plot confirms our suspicions that the plot was skewed right. What this essentially means is that the majority of the data indicates a very small number of birds being hit, while there are few very large number cases.

qqnorm(model_residuals)
qqline(model_residuals)

As we can see with this QQ plot, the model for the most part does not fit the line and indicates a large number of outliers way above the line.