project_datavisualization_2

Author

Dinah Marion Abeja

library(readr)
library(lubridate)


Attaching package: 'lubridate'

The following objects are masked from 'package:base':

    date, intersect, setdiff, union

library(tidyverse)

── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──

✔ ggplot2 3.4.0     ✔ dplyr   1.1.0
✔ tibble  3.1.8     ✔ stringr 1.5.0
✔ tidyr   1.2.1     ✔ forcats 0.5.2
✔ purrr   1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::as.difftime() masks base::as.difftime()
✖ lubridate::date()        masks base::date()
✖ dplyr::filter()          masks stats::filter()
✖ lubridate::intersect()   masks base::intersect()
✖ dplyr::lag()             masks stats::lag()
✖ lubridate::setdiff()     masks base::setdiff()
✖ lubridate::union()       masks base::union()

library(zoo)


Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

library(plotly)


Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

library(dplyr)
library(ggsci)
library(patchwork)
library(gridExtra)


Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine

library(performance)
library(rsample)
library(maps)


Attaching package: 'maps'

The following object is masked from 'package:purrr':

    map

library(sjPlot)
library(sjmisc)


Attaching package: 'sjmisc'

The following object is masked from 'package:purrr':

    is_empty

The following object is masked from 'package:tidyr':

    replace_na

The following object is masked from 'package:tibble':

    add_case

library(sjlabelled)


Attaching package: 'sjlabelled'

The following object is masked from 'package:forcats':

    as_factor

The following object is masked from 'package:dplyr':

    as_label

The following object is masked from 'package:ggplot2':

    as_label

sparrow_m<-read_csv("birds 11.38.48 AM.csv")

New names:
Rows: 6092602 Columns: 9
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): species dbl (7): ...1, day, month, year, decimalLatitude,
decimalLongitude, count dttm (1): eventDate
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`

head(sparrow_m)

# A tibble: 6 × 9
   ...1 species        day month  year decim…¹ decim…² eventDate           count
  <dbl> <chr>        <dbl> <dbl> <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella pu…     1     8  2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella pu…    11     5  2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella pu…    21     2  2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella pu…    13     7  2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella pu…     4     5  2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella pu…    20     7  2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with abbreviated variable names ¹decimalLatitude, ²decimalLongitude

sparrows_d <- sparrow_m 
colnames(sparrows_d)[5] = "time_year"
head(sparrows_d)

# A tibble: 6 × 9
   ...1 species      day month time_…¹ decim…² decim…³ eventDate           count
  <dbl> <chr>      <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella …     1     8    2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella …    11     5    2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella …    21     2    2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella …    13     7    2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella …     4     5    2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella …    20     7    2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with abbreviated variable names ¹time_year, ²decimalLatitude,
#   ³decimalLongitude

x_min<- min(sparrow_m[,6])
x_max<- max(sparrow_m[,6])
x_min

[1] 24.53307

x_max

[1] 48.14776

#counts of species by year and species

sparrows_m5 <- sparrows_d %>% 
  group_by(time_year,species) %>% 
  dplyr::summarize(sum_count=sum(count),
            .groups = 'drop') %>%
  as.data.frame()

head(sparrows_m5)

  time_year           species sum_count
1      1900 Passer domesticus        84
2      1900  Spizella pusilla        98
3      1901 Passer domesticus        16
4      1901  Spizella pusilla        74
5      1902 Passer domesticus        13
6      1902  Spizella pusilla       108

y_min<- min(sparrows_m5[,3])
y_max<- max(sparrows_m5[,3])
y_min

[1] 3

y_max

[1] 5479090

sparrows_passer <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Passer domesticus", ) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n)) %>%
  as.data.frame()

`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.

head(sparrows_passer)

  time_year           species sum_count mean_count  sd_count  n        se
1      1900 Passer domesticus        84   1.333333 2.4029552 63 0.3027439
2      1901 Passer domesticus        16   1.333333 1.1547005 12 0.3333333
3      1902 Passer domesticus        13   1.181818 0.6030227 11 0.1818182
4      1903 Passer domesticus        28   1.333333 1.0645813 21 0.2323107
5      1904 Passer domesticus        22   1.157895 0.6882472 19 0.1578947
6      1905 Passer domesticus        41   1.464286 1.2904820 28 0.2438782

sparrows_passer2 <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Passer domesticus",between (time_year,2000,2020)) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n),
            .groups = "keep") %>%
  as.data.frame()
head(sparrows_passer2)

  time_year           species sum_count mean_count sd_count     n         se
1      2000 Passer domesticus     58540   2.670377 27.75584 21922 0.18746246
2      2001 Passer domesticus     51747   2.887829 11.15980 17919 0.08336804
3      2002 Passer domesticus     79041   3.896332 12.31492 20286 0.08646361
4      2003 Passer domesticus    115396   4.427070 13.32246 26066 0.08251771
5      2004 Passer domesticus    143474   5.129567 14.42218 27970 0.08623524
6      2005 Passer domesticus    160639   4.920332 14.25601 32648 0.07889868

sparrows_spizella <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Spizella pusilla", ) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n)) %>%
  as.data.frame()

`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.

head(sparrows_spizella)

  time_year          species sum_count mean_count  sd_count   n         se
1      1900 Spizella pusilla        98   1.180723 0.7181375  83 0.07882583
2      1901 Spizella pusilla        74   1.042254 0.2638517  71 0.03131343
3      1902 Spizella pusilla       108   1.080000 0.4644971 100 0.04644971
4      1903 Spizella pusilla        97   1.000000 0.0000000  97 0.00000000
5      1904 Spizella pusilla       149   1.103704 0.5363513 135 0.04616177
6      1905 Spizella pusilla       150   1.027397 0.2611699 146 0.02161458

sparrows_spizella2 <- sparrows_d %>%
  group_by(time_year,species) %>% 
filter(species =="Spizella pusilla",between (time_year,2000,2020)) %>%
  dplyr::summarize(sum_count=sum(count),mean_count = mean(count), sd_count = sd(count),n=n(),se= sd_count/sqrt(n)) %>%
  as.data.frame()

`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.

head(sparrows_spizella2)

  time_year          species sum_count mean_count sd_count     n         se
1      2000 Spizella pusilla     11633   2.328929 5.624631  4995 0.07958410
2      2001 Spizella pusilla     11640   2.394569 4.097807  4861 0.05877446
3      2002 Spizella pusilla     14655   2.515880 3.715426  5825 0.04868113
4      2003 Spizella pusilla     17873   2.535897 3.746526  7048 0.04462681
5      2004 Spizella pusilla     21027   2.623783 4.943594  8014 0.05522277
6      2005 Spizella pusilla     25759   2.458858 4.296097 10476 0.04197361

p0 <- ggplot(data = sparrows_passer,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Passer domesticus over time", x= "time", y ="mean count")+
  theme_classic()

p1 <- ggplot(data = sparrows_passer2,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Passer domesticus from 2000-2020", x= "time", y ="mean_count")+
  theme_bw()
p0+p1

p2 <- ggplot(data = sparrows_spizella,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Spizella pusilla over time", x= "time", y ="count")+
  theme_classic()

p3 <- ggplot(data = sparrows_spizella2,aes(y = mean_count, x =time_year)) +
  geom_line()+
  labs(title = "Distribution of Spizella pusilla from 2000-2020", x= "time", y ="count")+
  theme_classic()
p2+p3

The distribution of bird species generally increased from 1900 - 1955. The same can be seen in the early 2005-2012 (very clear on the second plot). I think this would be an interesting trend to investigate / hypothesize later on.

all_birds <- rbind(sparrows_passer2,sparrows_spizella2)
head(all_birds)

  time_year           species sum_count mean_count sd_count     n         se
1      2000 Passer domesticus     58540   2.670377 27.75584 21922 0.18746246
2      2001 Passer domesticus     51747   2.887829 11.15980 17919 0.08336804
3      2002 Passer domesticus     79041   3.896332 12.31492 20286 0.08646361
4      2003 Passer domesticus    115396   4.427070 13.32246 26066 0.08251771
5      2004 Passer domesticus    143474   5.129567 14.42218 27970 0.08623524
6      2005 Passer domesticus    160639   4.920332 14.25601 32648 0.07889868

p5 <- ggplot(data = all_birds,aes(y = sum_count, x = time_year,  color = species)) +
  geom_point()+
  geom_line()+
  labs(title = "Distribution of Sparrows from 2000-2020", x= "species", y ="Total # birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()
p5+geom_errorbar(aes(ymin=mean_count-se, ymax=mean_count+se), width=.2,
                 position=position_dodge(.9))

The number of sparrows tracked generally increased for the two decades between 2000-2020. The invasive species (Passer domesticus) was greater than the Spizella pusilla which makes sense. But this is not also a very sufficient graph. I am not really sure about the error bars being similar for both species

sparrows_d$lat <- ifelse(sparrows_d$decimalLatitude >10 & sparrows_d$decimalLatitude <20,"10-20",
                         ifelse(sparrows_d$decimalLatitude > 20 & sparrows_d$decimalLatitude< 30, "20-30",
                                ifelse(sparrows_d$decimalLatitude >30 &sparrows_d$decimalLatitude <40,"30-40",
                                ifelse(sparrows_d$decimalLatitude >40 &sparrows_d$decimalLatitude <50,"40-50",
                                       ifelse(sparrows_d$decimalLatitude >50 &sparrows_d$decimalLatitude< 60,"other",NA)))))
head(sparrows_d)

# A tibble: 6 × 10
   ...1 species      day month time_…¹ decim…² decim…³ eventDate           count
  <dbl> <chr>      <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella …     1     8    2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella …    11     5    2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella …    21     2    2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella …    13     7    2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella …     4     5    2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella …    20     7    2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with 1 more variable: lat <chr>, and abbreviated variable names ¹time_year,
#   ²decimalLatitude, ³decimalLongitude

sparrows_m6 <- sparrows_d %>% group_by(time_year,lat,species) %>% 
  dplyr::summarize(sum_count1=sum(count),
            .groups = 'drop') %>%
  as.data.frame()
head(sparrows_m6)

  time_year   lat           species sum_count1
1      1900 30-40 Passer domesticus          7
2      1900 30-40  Spizella pusilla         16
3      1900 40-50 Passer domesticus         77
4      1900 40-50  Spizella pusilla         82
5      1901 30-40 Passer domesticus          3
6      1901 30-40  Spizella pusilla         14

p6 <- ggplot(data = sparrows_m6,aes(y = sum_count1, x = lat,  color = species)) +
  geom_point()+
  labs(title = "Distribution of Sparrows in space", x= "Latitude range", y ="Total # birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()

p7 <- ggplot(data = sparrows_m6, mapping = aes(x = lat, color = species))+ 
  geom_density()+
  labs(title = "Distribution of Sparrows in space", x= "Latitude range")+
  theme_classic()
grid.arrange(p6,p7, ncol =1)

The distribution of Passer domesticus is generally higher than that of Spizella pusilla for all latitude ranges (both plots. The first graph however seems to be show this better than the histogram.

this would make sense since we hypothesized that the Passer domesticus is the invasive species and thus would overtime out-compete the Spizella pusilla - but am not sure the counts used are meaningful

DATA VISUALIZATION HYPOTHESIS 2

#counts of species by year and species
#then abundance by ratios

sparrows_m5_2 <- sparrows_d %>%
  filter(time_year > 1999, time_year<2021) %>%
  group_by(time_year,species) %>% 
  dplyr::summarize(sum_count=sum(count),
            .groups = 'drop') %>%
  as.data.frame()

head(sparrows_m5_2)

  time_year           species sum_count
1      2000 Passer domesticus     58540
2      2000  Spizella pusilla     11633
3      2001 Passer domesticus     51747
4      2001  Spizella pusilla     11640
5      2002 Passer domesticus     79041
6      2002  Spizella pusilla     14655

p5 <- ggplot(data = sparrows_m5_2,aes(y = sum_count, x = time_year,  color = species)) +
  geom_point()+
  geom_line()+
  labs(title = "Distribution of Sparrows from 2000-2020", x= "species", y ="Abundance of birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()
p5

# transforming abundance 
sparrows_m5_2$log_counts <-log10(sparrows_m5_2$sum_count)
head(sparrows_m5_2)

  time_year           species sum_count log_counts
1      2000 Passer domesticus     58540   4.767453
2      2000  Spizella pusilla     11633   4.065692
3      2001 Passer domesticus     51747   4.713885
4      2001  Spizella pusilla     11640   4.065953
5      2002 Passer domesticus     79041   4.897852
6      2002  Spizella pusilla     14655   4.165986

table_1 <- knitr::kable(head(sparrows_m5_2))
table_1

time_year	species	sum_count	log_counts
2000	Passer domesticus	58540	4.767453
2000	Spizella pusilla	11633	4.065692
2001	Passer domesticus	51747	4.713885
2001	Spizella pusilla	11640	4.065953
2002	Passer domesticus	79041	4.897852
2002	Spizella pusilla	14655	4.165986

p5 <- ggplot(data = sparrows_m5_2,aes(y = log_counts, x = time_year,  color = species)) +
  geom_point()+
  geom_line()+
  labs(title = "Distribution of Sparrows from 2000-2021", x= "species", y ="log_abundance of birds")+
  theme_bw()+
  theme(axis.text.x=element_text(angle = 0, size = 15, vjust = 1), axis.title.y = element_text(size = 16),axis.title.x = element_text(size = 16))+
  theme_classic()
p5

There is definitely difference in relative log abundance of sparrows for the two species. Passer domesticus were higher than Spizella pusilla over the years between 2000 to 2021

LINEAR MODELS FOR HYPOTHESIS 2

lm_sparrows <- lm(log_counts~species*time_year, data=sparrows_m5_2)
summary(lm_sparrows)


Call:
lm(formula = log_counts ~ species * time_year, data = sparrows_m5_2)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.152049 -0.038272 -0.000914  0.045682  0.114475 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                       -1.954e+02  4.570e+00 -42.759  < 2e-16 ***
speciesSpizella pusilla            1.707e+01  6.463e+00   2.642  0.01191 *  
time_year                          1.001e-01  2.274e-03  44.021  < 2e-16 ***
speciesSpizella pusilla:time_year -8.916e-03  3.215e-03  -2.773  0.00856 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.06309 on 38 degrees of freedom
Multiple R-squared:  0.9931,    Adjusted R-squared:  0.9925 
F-statistic:  1817 on 3 and 38 DF,  p-value: < 2.2e-16

anova(lm_sparrows)

Analysis of Variance Table

Response: log_counts
                  Df  Sum Sq Mean Sq   F value    Pr(>F)    
species            1  7.5789  7.5789 1904.1389 < 2.2e-16 ***
time_year          1 14.0826 14.0826 3538.1441 < 2.2e-16 ***
species:time_year  1  0.0306  0.0306    7.6901  0.008557 ** 
Residuals         38  0.1512  0.0040                        
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

tab_model(lm_sparrows,auto.label =TRUE)

	log_counts
Predictors	Estimates	CI	p
(Intercept)	-195.41	-204.66 – -186.15	<0.001
species [Spizella pusilla]	17.07	3.99 – 30.16	0.012
time year	0.10	0.10 – 0.10	<0.001
species [Spizella pusilla] × time year	-0.01	-0.02 – -0.00	0.009
Observations	42
R² / R² adjusted	0.993 / 0.993

check_model(lm_sparrows)

Variable `Component` is not in your data frame :/

lm_sparrows_1 <- lm(log_counts~species+time_year, data=sparrows_m5_2)
summary(lm_sparrows_1)


Call:
lm(formula = log_counts ~ species + time_year, data = sparrows_m5_2)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.192172 -0.035905  0.006039  0.047578  0.108732 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
(Intercept)             -186.44414    3.49762  -53.31   <2e-16 ***
speciesSpizella pusilla   -0.84959    0.02107  -40.31   <2e-16 ***
time_year                  0.09563    0.00174   54.95   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.06829 on 39 degrees of freedom
Multiple R-squared:  0.9917,    Adjusted R-squared:  0.9912 
F-statistic:  2323 on 2 and 39 DF,  p-value: < 2.2e-16

anova(lm_sparrows_1)

Analysis of Variance Table

Response: log_counts
          Df  Sum Sq Mean Sq F value    Pr(>F)    
species    1  7.5789  7.5789  1625.3 < 2.2e-16 ***
time_year  1 14.0826 14.0826  3020.1 < 2.2e-16 ***
Residuals 39  0.1819  0.0047                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

check_model(lm_sparrows_1)

Variable `Component` is not in your data frame :/

tab_model(lm_sparrows_1,auto.label =TRUE)

	log_counts
Predictors	Estimates	CI	p
(Intercept)	-186.44	-193.52 – -179.37	<0.001
species [Spizella pusilla]	-0.85	-0.89 – -0.81	<0.001
time year	0.10	0.09 – 0.10	<0.001
Observations	42
R² / R² adjusted	0.992 / 0.991

CONFIDENCE INTERVALS

tidy(lm_sparrows_1)

# A tibble: 3 × 5
  term                     estimate std.error statistic  p.value
  <chr>                       <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)             -186.       3.50        -53.3 4.70e-38
2 speciesSpizella pusilla   -0.850    0.0211      -40.3 2.09e-33
3 time_year                  0.0956   0.00174      55.0 1.46e-38

set.seed(400) #any number is fine
sparrows_intervals<- reg_intervals(log_counts~species+time_year, data=sparrows_m5_2, 
                                   type='percentile',
                                   keep_reps=FALSE)

sparrows_intervals

# A tibble: 2 × 6
  term                     .lower .estimate  .upper .alpha .method   
  <chr>                     <dbl>     <dbl>   <dbl>  <dbl> <chr>     
1 speciesSpizella pusilla -0.892    -0.851  -0.810    0.05 percentile
2 time_year                0.0918    0.0957  0.0998   0.05 percentile

#plot the results
sparrows_boots<-ggplot(data=sparrows_intervals, aes(x=.estimate, y=term))+
  geom_vline(xintercept=0, linetype=2)+
  geom_errorbarh(aes(xmin=.lower, xmax=.upper),height=0.1)+
  geom_point(size=3)+
  theme_classic()
sparrows_boots

time- year has positive effect and being Spizella has negative effect

birds2 <- sparrows_d %>%
  group_by(time_year, species) %>%
  filter(time_year== 2000) %>%
  dplyr::summarise(lat=mean(decimalLatitude),
            long = mean(decimalLongitude)) %>%
  as.data.frame()

`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.

birds2

  time_year           species      lat      long
1      2000 Passer domesticus 40.28972 -80.25492
2      2000  Spizella pusilla 39.35673 -81.20765

us <- map_data("state")
head(us)

       long      lat group order  region subregion
1 -87.46201 30.38968     1     1 alabama      <NA>
2 -87.48493 30.37249     1     2 alabama      <NA>
3 -87.52503 30.37249     1     3 alabama      <NA>
4 -87.53076 30.33239     1     4 alabama      <NA>
5 -87.57087 30.32665     1     5 alabama      <NA>
6 -87.58806 30.32665     1     6 alabama      <NA>

fig1 <- ggplot() +
  geom_polygon(data=us, aes(x=long, y=lat, group = group), color="white", fill="lightblue") +
  labs(title="Sparrow Migrations") +
  geom_point(data = birds2, aes(x =long, y = lat, color = time_year, shape = species, alpha = 1))
print(fig1)