project_datavisualization_2

Author

Dinah Marion Abeja

library(readr)
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ dplyr   1.1.0
✔ tibble  3.1.8     ✔ stringr 1.5.0
✔ tidyr   1.2.1     ✔ forcats 0.5.2
✔ purrr   1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::as.difftime() masks base::as.difftime()
✖ lubridate::date()        masks base::date()
✖ dplyr::filter()          masks stats::filter()
✖ lubridate::intersect()   masks base::intersect()
✖ dplyr::lag()             masks stats::lag()
✖ lubridate::setdiff()     masks base::setdiff()
✖ lubridate::union()       masks base::union()
library(zoo)

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(dplyr)
library(ggsci)
library(patchwork)
library(gridExtra)

Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine
library(performance)
library(rsample)
library(maps)

Attaching package: 'maps'

The following object is masked from 'package:purrr':

    map
library(sjPlot)
library(sjmisc)

Attaching package: 'sjmisc'

The following object is masked from 'package:purrr':

    is_empty

The following object is masked from 'package:tidyr':

    replace_na

The following object is masked from 'package:tibble':

    add_case
library(sjlabelled)

Attaching package: 'sjlabelled'

The following object is masked from 'package:forcats':

    as_factor

The following object is masked from 'package:dplyr':

    as_label

The following object is masked from 'package:ggplot2':

    as_label
sparrow_m<-read_csv("birds 11.38.48 AM.csv")
New names:
Rows: 6092602 Columns: 9
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): species dbl (7): ...1, day, month, year, decimalLatitude,
decimalLongitude, count dttm (1): eventDate
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
head(sparrow_m)
# A tibble: 6 × 9
   ...1 species        day month  year decim…¹ decim…² eventDate           count
  <dbl> <chr>        <dbl> <dbl> <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella pu…     1     8  2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella pu…    11     5  2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella pu…    21     2  2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella pu…    13     7  2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella pu…     4     5  2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella pu…    20     7  2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with abbreviated variable names ¹​decimalLatitude, ²​decimalLongitude
sparrows_d <- sparrow_m 
colnames(sparrows_d)[5] = "time_year"
head(sparrows_d)
# A tibble: 6 × 9
   ...1 species      day month time_…¹ decim…² decim…³ eventDate           count
  <dbl> <chr>      <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella …     1     8    2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella …    11     5    2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella …    21     2    2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella …    13     7    2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella …     4     5    2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella …    20     7    2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with abbreviated variable names ¹​time_year, ²​decimalLatitude,
#   ³​decimalLongitude
x_min<- min(sparrow_m[,6])
x_max<- max(sparrow_m[,6])
x_min
[1] 24.53307
x_max
[1] 48.14776
#counts of species by year and species

sparrows_m5 <- sparrows_d %>% 
  group_by(time_year,species) %>% 
  dplyr::summarize(sum_count=sum(count),
            .groups = 'drop') %>%
  as.data.frame()

head(sparrows_m5)
  time_year           species sum_count
1      1900 Passer domesticus        84
2      1900  Spizella pusilla        98
3      1901 Passer domesticus        16
4      1901  Spizella pusilla        74
5      1902 Passer domesticus        13
6      1902  Spizella pusilla       108
y_min<- min(sparrows_m5[,3])
y_max<- max(sparrows_m5[,3])
y_min
[1] 3
y_max
[1] 5479090
sparrows_passer <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Passer domesticus", ) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n)) %>%
  as.data.frame()
`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.
head(sparrows_passer)
  time_year           species sum_count mean_count  sd_count  n        se
1      1900 Passer domesticus        84   1.333333 2.4029552 63 0.3027439
2      1901 Passer domesticus        16   1.333333 1.1547005 12 0.3333333
3      1902 Passer domesticus        13   1.181818 0.6030227 11 0.1818182
4      1903 Passer domesticus        28   1.333333 1.0645813 21 0.2323107
5      1904 Passer domesticus        22   1.157895 0.6882472 19 0.1578947
6      1905 Passer domesticus        41   1.464286 1.2904820 28 0.2438782
sparrows_passer2 <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Passer domesticus",between (time_year,2000,2020)) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n),
            .groups = "keep") %>%
  as.data.frame()
head(sparrows_passer2)
  time_year           species sum_count mean_count sd_count     n         se
1      2000 Passer domesticus     58540   2.670377 27.75584 21922 0.18746246
2      2001 Passer domesticus     51747   2.887829 11.15980 17919 0.08336804
3      2002 Passer domesticus     79041   3.896332 12.31492 20286 0.08646361
4      2003 Passer domesticus    115396   4.427070 13.32246 26066 0.08251771
5      2004 Passer domesticus    143474   5.129567 14.42218 27970 0.08623524
6      2005 Passer domesticus    160639   4.920332 14.25601 32648 0.07889868
sparrows_spizella <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Spizella pusilla", ) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n)) %>%
  as.data.frame()
`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.
head(sparrows_spizella)
  time_year          species sum_count mean_count  sd_count   n         se
1      1900 Spizella pusilla        98   1.180723 0.7181375  83 0.07882583
2      1901 Spizella pusilla        74   1.042254 0.2638517  71 0.03131343
3      1902 Spizella pusilla       108   1.080000 0.4644971 100 0.04644971
4      1903 Spizella pusilla        97   1.000000 0.0000000  97 0.00000000
5      1904 Spizella pusilla       149   1.103704 0.5363513 135 0.04616177
6      1905 Spizella pusilla       150   1.027397 0.2611699 146 0.02161458
sparrows_spizella2 <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Spizella pusilla",between (time_year,2000,2020)) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n)) %>%
  as.data.frame()
`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.
head(sparrows_spizella2)
  time_year          species sum_count mean_count sd_count     n         se
1      2000 Spizella pusilla     11633   2.328929 5.624631  4995 0.07958410
2      2001 Spizella pusilla     11640   2.394569 4.097807  4861 0.05877446
3      2002 Spizella pusilla     14655   2.515880 3.715426  5825 0.04868113
4      2003 Spizella pusilla     17873   2.535897 3.746526  7048 0.04462681
5      2004 Spizella pusilla     21027   2.623783 4.943594  8014 0.05522277
6      2005 Spizella pusilla     25759   2.458858 4.296097 10476 0.04197361
p0 <- ggplot(data = sparrows_passer,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Passer domesticus over time", x= "time", y ="mean count")+
  theme_classic()

p1 <- ggplot(data = sparrows_passer2,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Passer domesticus from 2000-2020", x= "time", y ="mean_count")+
  theme_bw()
p0+p1

p2 <- ggplot(data = sparrows_spizella,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Spizella pusilla over time", x= "time", y ="count")+
  theme_classic()

p3 <- ggplot(data = sparrows_spizella2,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Spizella pusilla from 2000-2020", x= "time", y ="count")+
  theme_classic()
p2+p3

The distribution of bird species generally increased from 1900 - 1955. The same can be seen in the early 2005-2012 (very clear on the second plot). I think this would be an interesting trend to investigate / hypothesize later on.

all_birds <- rbind(sparrows_passer2,sparrows_spizella2)
head(all_birds)
  time_year           species sum_count mean_count sd_count     n         se
1      2000 Passer domesticus     58540   2.670377 27.75584 21922 0.18746246
2      2001 Passer domesticus     51747   2.887829 11.15980 17919 0.08336804
3      2002 Passer domesticus     79041   3.896332 12.31492 20286 0.08646361
4      2003 Passer domesticus    115396   4.427070 13.32246 26066 0.08251771
5      2004 Passer domesticus    143474   5.129567 14.42218 27970 0.08623524
6      2005 Passer domesticus    160639   4.920332 14.25601 32648 0.07889868
p5 <- ggplot(data = all_birds,aes(y = sum_count, x = time_year,  color = species)) +
  geom_point()+
  geom_line()+
  labs(title = "Distribution of Sparrows from 2000-2020", x= "species", y ="Total # birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()
p5+geom_errorbar(aes(ymin=mean_count-se, ymax=mean_count+se), width=.2,
                 position=position_dodge(.9)) 

The number of sparrows tracked generally increased for the two decades between 2000-2020. The invasive species (Passer domesticus) was greater than the Spizella pusilla which makes sense. But this is not also a very sufficient graph. I am not really sure about the error bars being similar for both species

sparrows_d$lat <- ifelse(sparrows_d$decimalLatitude >10 & sparrows_d$decimalLatitude <20,"10-20",
                         ifelse(sparrows_d$decimalLatitude > 20 & sparrows_d$decimalLatitude< 30, "20-30",
                                ifelse(sparrows_d$decimalLatitude >30 &sparrows_d$decimalLatitude <40,"30-40",
                                ifelse(sparrows_d$decimalLatitude >40 &sparrows_d$decimalLatitude <50,"40-50",
                                       ifelse(sparrows_d$decimalLatitude >50 &sparrows_d$decimalLatitude< 60,"other",NA)))))
head(sparrows_d)
# A tibble: 6 × 10
   ...1 species      day month time_…¹ decim…² decim…³ eventDate           count
  <dbl> <chr>      <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella …     1     8    2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella …    11     5    2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella …    21     2    2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella …    13     7    2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella …     4     5    2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella …    20     7    2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with 1 more variable: lat <chr>, and abbreviated variable names ¹​time_year,
#   ²​decimalLatitude, ³​decimalLongitude
sparrows_m6 <- sparrows_d %>% group_by(time_year,lat,species) %>% 
  dplyr::summarize(sum_count1=sum(count),
            .groups = 'drop') %>%
  as.data.frame()
head(sparrows_m6)
  time_year   lat           species sum_count1
1      1900 30-40 Passer domesticus          7
2      1900 30-40  Spizella pusilla         16
3      1900 40-50 Passer domesticus         77
4      1900 40-50  Spizella pusilla         82
5      1901 30-40 Passer domesticus          3
6      1901 30-40  Spizella pusilla         14
p6 <- ggplot(data = sparrows_m6,aes(y = sum_count1, x = lat,  color = species)) +
  geom_point()+
  labs(title = "Distribution of Sparrows in space", x= "Latitude range", y ="Total # birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()

p7 <- ggplot(data = sparrows_m6, mapping = aes(x = lat, color = species))+ 
  geom_density()+
  labs(title = "Distribution of Sparrows in space", x= "Latitude range")+
  theme_classic()
grid.arrange(p6,p7, ncol =1)

The distribution of Passer domesticus is generally higher than that of Spizella pusilla for all latitude ranges (both plots. The first graph however seems to be show this better than the histogram.

this would make sense since we hypothesized that the Passer domesticus is the invasive species and thus would overtime out-compete the Spizella pusilla - but am not sure the counts used are meaningful

DATA VISUALIZATION HYPOTHESIS 2

#counts of species by year and species
#then abundance by ratios

sparrows_m5_2 <- sparrows_d %>%
  filter(time_year > 1999, time_year<2021) %>%
  group_by(time_year,species) %>% 
  dplyr::summarize(sum_count=sum(count),
            .groups = 'drop') %>%
  as.data.frame()

head(sparrows_m5_2)
  time_year           species sum_count
1      2000 Passer domesticus     58540
2      2000  Spizella pusilla     11633
3      2001 Passer domesticus     51747
4      2001  Spizella pusilla     11640
5      2002 Passer domesticus     79041
6      2002  Spizella pusilla     14655
p5 <- ggplot(data = sparrows_m5_2,aes(y = sum_count, x = time_year,  color = species)) +
  geom_point()+
  geom_line()+
  labs(title = "Distribution of Sparrows from 2000-2020", x= "species", y ="Abundance of birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()
p5

# transforming abundance 
sparrows_m5_2$log_counts <-log10(sparrows_m5_2$sum_count)
head(sparrows_m5_2)
  time_year           species sum_count log_counts
1      2000 Passer domesticus     58540   4.767453
2      2000  Spizella pusilla     11633   4.065692
3      2001 Passer domesticus     51747   4.713885
4      2001  Spizella pusilla     11640   4.065953
5      2002 Passer domesticus     79041   4.897852
6      2002  Spizella pusilla     14655   4.165986
table_1 <- knitr::kable(head(sparrows_m5_2))
table_1
time_year species sum_count log_counts
2000 Passer domesticus 58540 4.767453
2000 Spizella pusilla 11633 4.065692
2001 Passer domesticus 51747 4.713885
2001 Spizella pusilla 11640 4.065953
2002 Passer domesticus 79041 4.897852
2002 Spizella pusilla 14655 4.165986
p5 <- ggplot(data = sparrows_m5_2,aes(y = log_counts, x = time_year,  color = species)) +
  geom_point()+
  geom_line()+
  labs(title = "Distribution of Sparrows from 2000-2021", x= "species", y ="log_abundance of birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()
p5

There is definitely difference in relative log abundance of sparrows for the two species. Passer domesticus were higher than Spizella pusilla over the years between 2000 to 2021

LINEAR MODELS FOR HYPOTHESIS 2

lm_sparrows <- lm(log_counts~species*time_year, data=sparrows_m5_2)
summary(lm_sparrows)

Call:
lm(formula = log_counts ~ species * time_year, data = sparrows_m5_2)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.152049 -0.038272 -0.000914  0.045682  0.114475 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                       -1.954e+02  4.570e+00 -42.759  < 2e-16 ***
speciesSpizella pusilla            1.707e+01  6.463e+00   2.642  0.01191 *  
time_year                          1.001e-01  2.274e-03  44.021  < 2e-16 ***
speciesSpizella pusilla:time_year -8.916e-03  3.215e-03  -2.773  0.00856 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.06309 on 38 degrees of freedom
Multiple R-squared:  0.9931,    Adjusted R-squared:  0.9925 
F-statistic:  1817 on 3 and 38 DF,  p-value: < 2.2e-16
anova(lm_sparrows)
Analysis of Variance Table

Response: log_counts
                  Df  Sum Sq Mean Sq   F value    Pr(>F)    
species            1  7.5789  7.5789 1904.1389 < 2.2e-16 ***
time_year          1 14.0826 14.0826 3538.1441 < 2.2e-16 ***
species:time_year  1  0.0306  0.0306    7.6901  0.008557 ** 
Residuals         38  0.1512  0.0040                        
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(lm_sparrows,auto.label =TRUE)
  log_counts
Predictors Estimates CI p
(Intercept) -195.41 -204.66 – -186.15 <0.001
species [Spizella
pusilla]
17.07 3.99 – 30.16 0.012
time year 0.10 0.10 – 0.10 <0.001
species [Spizella
pusilla] × time year
-0.01 -0.02 – -0.00 0.009
Observations 42
R2 / R2 adjusted 0.993 / 0.993
check_model(lm_sparrows)
Variable `Component` is not in your data frame :/

lm_sparrows_1 <- lm(log_counts~species+time_year, data=sparrows_m5_2)
summary(lm_sparrows_1)

Call:
lm(formula = log_counts ~ species + time_year, data = sparrows_m5_2)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.192172 -0.035905  0.006039  0.047578  0.108732 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -186.44414    3.49762  -53.31   <2e-16 ***
speciesSpizella pusilla   -0.84959    0.02107  -40.31   <2e-16 ***
time_year                  0.09563    0.00174   54.95   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.06829 on 39 degrees of freedom
Multiple R-squared:  0.9917,    Adjusted R-squared:  0.9912 
F-statistic:  2323 on 2 and 39 DF,  p-value: < 2.2e-16
anova(lm_sparrows_1)
Analysis of Variance Table

Response: log_counts
          Df  Sum Sq Mean Sq F value    Pr(>F)    
species    1  7.5789  7.5789  1625.3 < 2.2e-16 ***
time_year  1 14.0826 14.0826  3020.1 < 2.2e-16 ***
Residuals 39  0.1819  0.0047                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
check_model(lm_sparrows_1)
Variable `Component` is not in your data frame :/

tab_model(lm_sparrows_1,auto.label =TRUE)
  log_counts
Predictors Estimates CI p
(Intercept) -186.44 -193.52 – -179.37 <0.001
species [Spizella
pusilla]
-0.85 -0.89 – -0.81 <0.001
time year 0.10 0.09 – 0.10 <0.001
Observations 42
R2 / R2 adjusted 0.992 / 0.991

CONFIDENCE INTERVALS

tidy(lm_sparrows_1)
# A tibble: 3 × 5
  term                     estimate std.error statistic  p.value
  <chr>                       <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)             -186.       3.50        -53.3 4.70e-38
2 speciesSpizella pusilla   -0.850    0.0211      -40.3 2.09e-33
3 time_year                  0.0956   0.00174      55.0 1.46e-38
set.seed(400) #any number is fine
sparrows_intervals<- reg_intervals(log_counts~species+time_year, data=sparrows_m5_2, 
                                   type='percentile',
                                   keep_reps=FALSE)

sparrows_intervals
# A tibble: 2 × 6
  term                     .lower .estimate  .upper .alpha .method   
  <chr>                     <dbl>     <dbl>   <dbl>  <dbl> <chr>     
1 speciesSpizella pusilla -0.892    -0.851  -0.810    0.05 percentile
2 time_year                0.0918    0.0957  0.0998   0.05 percentile
#plot the results
sparrows_boots<-ggplot(data=sparrows_intervals, aes(x=.estimate, y=term))+
  geom_vline(xintercept=0, linetype=2)+
  geom_errorbarh(aes(xmin=.lower, xmax=.upper),height=0.1)+
  geom_point(size=3)+
  theme_classic()
sparrows_boots

time- year has positive effect and being Spizella has negative effect

birds2 <- sparrows_d %>%
  group_by(time_year, species) %>%
  filter(time_year== 2000) %>%
  dplyr::summarise(lat=mean(decimalLatitude),
            long = mean(decimalLongitude)) %>%
  as.data.frame()
`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.
birds2
  time_year           species      lat      long
1      2000 Passer domesticus 40.28972 -80.25492
2      2000  Spizella pusilla 39.35673 -81.20765
us <- map_data("state")
head(us)
       long      lat group order  region subregion
1 -87.46201 30.38968     1     1 alabama      <NA>
2 -87.48493 30.37249     1     2 alabama      <NA>
3 -87.52503 30.37249     1     3 alabama      <NA>
4 -87.53076 30.33239     1     4 alabama      <NA>
5 -87.57087 30.32665     1     5 alabama      <NA>
6 -87.58806 30.32665     1     6 alabama      <NA>
fig1 <- ggplot() +
  geom_polygon(data=us, aes(x=long, y=lat, group = group), color="white", fill="lightblue") +
  labs(title="Sparrow Migrations") +
  geom_point(data = birds2, aes(x =long, y = lat, color = time_year, shape = species, alpha = 1))
print(fig1)