project_datavisualization_2

Author

Dinah Marion Abeja

library(readr)
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ dplyr   1.1.0
✔ tibble  3.1.8     ✔ stringr 1.5.0
✔ tidyr   1.2.1     ✔ forcats 0.5.2
✔ purrr   1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ lubridate::as.difftime() masks base::as.difftime()
✖ lubridate::date()        masks base::date()
✖ dplyr::filter()          masks stats::filter()
✖ lubridate::intersect()   masks base::intersect()
✖ dplyr::lag()             masks stats::lag()
✖ lubridate::setdiff()     masks base::setdiff()
✖ lubridate::union()       masks base::union()
library(zoo)

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(dplyr)
library(ggsci)
library(patchwork)
library(gridExtra)

Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine
# Read the sparrow records CSV into a tibble.
sparrow_m <- readr::read_csv(file = "birds 11.38.48 AM.csv")
New names:
Rows: 6092602 Columns: 9
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(1): species dbl (7): ...1, day, month, year, decimalLatitude,
decimalLongitude, count dttm (1): eventDate
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...1`
head(sparrow_m)
# A tibble: 6 × 9
   ...1 species        day month  year decim…¹ decim…² eventDate           count
  <dbl> <chr>        <dbl> <dbl> <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella pu…     1     8  2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella pu…    11     5  2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella pu…    21     2  2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella pu…    13     7  2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella pu…     4     5  2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella pu…    20     7  2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with abbreviated variable names ¹​decimalLatitude, ²​decimalLongitude
# Working copy of the raw data with `year` renamed to `time_year`.
# The column is renamed by name rather than by position so the step keeps
# working if the column order of the CSV ever changes.
sparrows_d <- sparrow_m %>%
  dplyr::rename(time_year = year)
head(sparrows_d)
# A tibble: 6 × 9
   ...1 species      day month time_…¹ decim…² decim…³ eventDate           count
  <dbl> <chr>      <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella …     1     8    2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella …    11     5    2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella …    21     2    2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella …    13     7    2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella …     4     5    2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella …    20     7    2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with abbreviated variable names ¹​time_year, ²​decimalLatitude,
#   ³​decimalLongitude
# Smallest and largest latitude in the raw records
# (decimalLatitude is the sixth column of sparrow_m).
x_min <- min(sparrow_m$decimalLatitude)
x_max <- max(sparrow_m$decimalLatitude)
x_min
[1] 24.53307
x_max
[1] 48.14776
# Total count for every year x species combination, as a plain data frame.

sparrows_m5 <- as.data.frame(
  sparrows_d %>%
    group_by(time_year, species) %>%
    dplyr::summarise(sum_count = sum(count), .groups = "drop")
)

head(sparrows_m5)
  time_year           species sum_count
1      1900 Passer domesticus        84
2      1900  Spizella pusilla        98
3      1901 Passer domesticus        16
4      1901  Spizella pusilla        74
5      1902 Passer domesticus        13
6      1902  Spizella pusilla       108
# Range of the per-year, per-species totals
# (sum_count is the third column of sparrows_m5).
y_min <- min(sparrows_m5$sum_count)
y_max <- max(sparrows_m5$sum_count)
y_min
[1] 3
y_max
[1] 5479090
# Yearly summary for Passer domesticus over the whole record: total count,
# mean count per record, standard deviation, number of records (n) and the
# standard error of the mean (sd / sqrt(n)).
# Rows are filtered before grouping so the condition is evaluated once rather
# than per group, and `.groups = "drop"` avoids the "grouped output" message.
sparrows_passer <- sparrows_d %>%
  dplyr::filter(species == "Passer domesticus") %>%
  group_by(time_year, species) %>%
  dplyr::summarize(
    sum_count = sum(count),
    mean_count = mean(count),
    sd_count = sd(count),
    n = n(),
    se = sd_count / sqrt(n),
    .groups = "drop"
  ) %>%
  as.data.frame()
`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.
head(sparrows_passer)
  time_year           species sum_count mean_count  sd_count  n        se
1      1900 Passer domesticus        84   1.333333 2.4029552 63 0.3027439
2      1901 Passer domesticus        16   1.333333 1.1547005 12 0.3333333
3      1902 Passer domesticus        13   1.181818 0.6030227 11 0.1818182
4      1903 Passer domesticus        28   1.333333 1.0645813 21 0.2323107
5      1904 Passer domesticus        22   1.157895 0.6882472 19 0.1578947
6      1905 Passer domesticus        41   1.464286 1.2904820 28 0.2438782
# Yearly summary for Passer domesticus restricted to 2000-2020 (inclusive):
# total, mean, standard deviation, record count and standard error.
sparrows_passer2 <- as.data.frame(
  sparrows_d %>%
    group_by(time_year, species) %>%
    filter(
      species == "Passer domesticus",
      between(time_year, 2000, 2020)
    ) %>%
    dplyr::summarize(
      sum_count = sum(count),
      mean_count = mean(count),
      sd_count = sd(count),
      n = n(),
      se = sd_count / sqrt(n),
      .groups = "keep"
    )
)
head(sparrows_passer2)
  time_year           species sum_count mean_count sd_count     n         se
1      2000 Passer domesticus     58540   2.670377 27.75584 21922 0.18746246
2      2001 Passer domesticus     51747   2.887829 11.15980 17919 0.08336804
3      2002 Passer domesticus     79041   3.896332 12.31492 20286 0.08646361
4      2003 Passer domesticus    115396   4.427070 13.32246 26066 0.08251771
5      2004 Passer domesticus    143474   5.129567 14.42218 27970 0.08623524
6      2005 Passer domesticus    160639   4.920332 14.25601 32648 0.07889868
# Yearly summary for Spizella pusilla over the whole record: total count,
# mean count per record, standard deviation, number of records (n) and the
# standard error of the mean (sd / sqrt(n)).
# Rows are filtered before grouping so the condition is evaluated once rather
# than per group, and `.groups = "drop"` avoids the "grouped output" message.
sparrows_spizella <- sparrows_d %>%
  dplyr::filter(species == "Spizella pusilla") %>%
  group_by(time_year, species) %>%
  dplyr::summarize(
    sum_count = sum(count),
    mean_count = mean(count),
    sd_count = sd(count),
    n = n(),
    se = sd_count / sqrt(n),
    .groups = "drop"
  ) %>%
  as.data.frame()
`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.
head(sparrows_spizella)
  time_year          species sum_count mean_count  sd_count   n         se
1      1900 Spizella pusilla        98   1.180723 0.7181375  83 0.07882583
2      1901 Spizella pusilla        74   1.042254 0.2638517  71 0.03131343
3      1902 Spizella pusilla       108   1.080000 0.4644971 100 0.04644971
4      1903 Spizella pusilla        97   1.000000 0.0000000  97 0.00000000
5      1904 Spizella pusilla       149   1.103704 0.5363513 135 0.04616177
6      1905 Spizella pusilla       150   1.027397 0.2611699 146 0.02161458
# Yearly summary for Spizella pusilla restricted to 2000-2020 (inclusive):
# total, mean, standard deviation, record count and standard error.
# Filtering happens before grouping, and `.groups = "drop"` avoids the
# "grouped output" message that summarise() otherwise prints.
sparrows_spizella2 <- sparrows_d %>%
  dplyr::filter(
    species == "Spizella pusilla",
    between(time_year, 2000, 2020)
  ) %>%
  group_by(time_year, species) %>%
  dplyr::summarize(
    sum_count = sum(count),
    mean_count = mean(count),
    sd_count = sd(count),
    n = n(),
    se = sd_count / sqrt(n),
    .groups = "drop"
  ) %>%
  as.data.frame()
`summarise()` has grouped output by 'time_year'. You can override using the
`.groups` argument.
head(sparrows_spizella2)
  time_year          species sum_count mean_count sd_count     n         se
1      2000 Spizella pusilla     11633   2.328929 5.624631  4995 0.07958410
2      2001 Spizella pusilla     11640   2.394569 4.097807  4861 0.05877446
3      2002 Spizella pusilla     14655   2.515880 3.715426  5825 0.04868113
4      2003 Spizella pusilla     17873   2.535897 3.746526  7048 0.04462681
5      2004 Spizella pusilla     21027   2.623783 4.943594  8014 0.05522277
6      2005 Spizella pusilla     25759   2.458858 4.296097 10476 0.04197361
# Mean count per record of Passer domesticus by year: the full record (p0)
# next to the 2000-2020 window (p1). Both panels use the same theme and the
# same human-readable y label so they read as a pair.
p0 <- ggplot(data = sparrows_passer, aes(x = time_year, y = mean_count)) +
  geom_line() +
  labs(
    title = "Distribution of Passer domesticus over time",
    x = "time",
    y = "mean count"
  ) +
  theme_classic()

p1 <- ggplot(data = sparrows_passer2, aes(x = time_year, y = mean_count)) +
  geom_line() +
  labs(
    title = "Distribution of Passer domesticus from 2000-2020",
    x = "time",
    y = "mean count"
  ) +
  theme_classic()

# patchwork places the two panels side by side.
p0 + p1

# Mean count per record of Spizella pusilla by year: the full record (p2)
# next to the 2000-2020 window (p3). The y axis shows mean_count, so it is
# labelled as a mean rather than as a raw count.
p2 <- ggplot(data = sparrows_spizella, aes(x = time_year, y = mean_count)) +
  geom_line() +
  labs(
    title = "Distribution of Spizella pusilla over time",
    x = "time",
    y = "mean count"
  ) +
  theme_classic()

p3 <- ggplot(data = sparrows_spizella2, aes(x = time_year, y = mean_count)) +
  geom_line() +
  labs(
    title = "Distribution of Spizella pusilla from 2000-2020",
    x = "time",
    y = "mean count"
  ) +
  theme_classic()

# patchwork places the two panels side by side.
p2 + p3

The mean counts of both bird species generally increased from 1900 to 1955. A similar increase can be seen from about 2005 to 2012 (very clear in the second plot of each pair). I think this would be an interesting trend to investigate and form hypotheses about later on.

# Stack the 2000-2020 summaries of both species into one data frame.
all_birds <- dplyr::bind_rows(sparrows_passer2, sparrows_spizella2)
head(all_birds)
  time_year           species sum_count mean_count sd_count     n         se
1      2000 Passer domesticus     58540   2.670377 27.75584 21922 0.18746246
2      2001 Passer domesticus     51747   2.887829 11.15980 17919 0.08336804
3      2002 Passer domesticus     79041   3.896332 12.31492 20286 0.08646361
4      2003 Passer domesticus    115396   4.427070 13.32246 26066 0.08251771
5      2004 Passer domesticus    143474   5.129567 14.42218 27970 0.08623524
6      2005 Passer domesticus    160639   4.920332 14.25601 32648 0.07889868
# Total birds per year for both species, 2000-2020, with error bars.
# theme_classic() is a complete theme and replaces every theme element, so it
# has to come before the theme() tweaks; otherwise the text sizes are lost.
p5 <- ggplot(
  data = all_birds,
  aes(x = time_year, y = sum_count, color = species)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Distribution of Sparrows from 2000-2020",
    x = "Year",
    y = "Total # birds"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 0, size = 15, vjust = 1),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_text(size = 16)
  )

# The y axis shows yearly totals, so the bars must be centred on sum_count.
# Treating n as fixed, the standard error of a total of n records is
# sd_count * sqrt(n), which equals n * se. Bars built from mean_count +/- se
# would sit near zero on this axis and look identical for both species.
# No dodging is applied: x is continuous and the bars belong on the points.
p5 +
  geom_errorbar(
    aes(ymin = sum_count - n * se, ymax = sum_count + n * se),
    width = 0.2
  )

The number of sparrows tracked generally increased over the two decades between 2000 and 2020. Counts of the invasive species (Passer domesticus) were greater than those of Spizella pusilla, which makes sense. However, this graph is also not very informative on its own. I am not sure the error bars are correct, since they look similar for both species.

# Bin latitude into 10-degree bands. cut() with right = FALSE makes each band
# closed on the left, [lower, upper), so a latitude exactly on a boundary
# (10, 20, 30, 40, 50) is assigned to a band instead of becoming NA, which is
# what a chain of strict > / < comparisons produces.
# The 50-60 band keeps its "other" label; latitudes outside [10, 60) are NA.
# as.character() keeps `lat` a character column.
sparrows_d$lat <- as.character(cut(
  sparrows_d$decimalLatitude,
  breaks = c(10, 20, 30, 40, 50, 60),
  labels = c("10-20", "20-30", "30-40", "40-50", "other"),
  right = FALSE
))
head(sparrows_d)
# A tibble: 6 × 10
   ...1 species      day month time_…¹ decim…² decim…³ eventDate           count
  <dbl> <chr>      <dbl> <dbl>   <dbl>   <dbl>   <dbl> <dttm>              <dbl>
1     1 Spizella …     1     8    2012    41.1   -81.4 2012-08-01 00:00:00     1
2     2 Spizella …    11     5    2020    41.5   -72.3 2020-05-11 08:51:00     1
3     3 Spizella …    21     2    2022    30.3   -88.7 2022-02-21 10:08:00     1
4     4 Spizella …    13     7    2019    40.3   -76.2 2019-07-13 00:00:00     1
5     5 Spizella …     4     5    2020    39.9   -86.1 2020-05-04 09:53:08     1
6     6 Spizella …    20     7    2017    36.3   -81.7 2017-07-20 00:00:00     1
# … with 1 more variable: lat <chr>, and abbreviated variable names ¹​time_year,
#   ²​decimalLatitude, ³​decimalLongitude
# Total count for every year x latitude band x species combination,
# returned as a plain data frame.
sparrows_m6 <- as.data.frame(
  sparrows_d %>%
    group_by(time_year, lat, species) %>%
    dplyr::summarise(sum_count1 = sum(count), .groups = "drop")
)
head(sparrows_m6)
  time_year   lat           species sum_count1
1      1900 30-40 Passer domesticus          7
2      1900 30-40  Spizella pusilla         16
3      1900 40-50 Passer domesticus         77
4      1900 40-50  Spizella pusilla         82
5      1901 30-40 Passer domesticus          3
6      1901 30-40  Spizella pusilla         14
# p6: yearly totals per latitude band, one point per year and species.
# theme_classic() is a complete theme and replaces every theme element, so it
# has to come before the theme() tweaks; otherwise the text sizes are lost.
p6 <- ggplot(
  data = sparrows_m6,
  aes(x = lat, y = sum_count1, color = species)
) +
  geom_point() +
  labs(
    title = "Distribution of Sparrows in space",
    x = "Latitude range",
    y = "Total # birds"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 0, size = 15, vjust = 1),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_text(size = 16)
  )

# p7: total birds per latitude band and species, summed over all years.
# `lat` is categorical and sparrows_m6 has one row per year, so a kernel
# density of `lat` would describe how many yearly rows exist per band, not how
# many birds were counted. Dodged bars of the summed counts show the totals.
lat_totals <- sparrows_m6 %>%
  group_by(lat, species) %>%
  dplyr::summarize(total_count = sum(sum_count1), .groups = "drop")

p7 <- ggplot(
  data = lat_totals,
  mapping = aes(x = lat, y = total_count, fill = species)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Sparrows by latitude range",
    x = "Latitude range",
    y = "Total # birds"
  ) +
  theme_classic()

# Stack the two panels vertically.
grid.arrange(p6, p7, ncol = 1)

The distribution of Passer domesticus is generally higher than that of Spizella pusilla for all latitude ranges (both plots). The first graph, however, seems to show this better than the histogram.

This would make sense, since we hypothesized that Passer domesticus is the invasive species and would therefore out-compete Spizella pusilla over time, but I am not sure the counts used are meaningful.

DATA VISUALIZATION

Initial Plot from Audrey’s Work

# Mean latitude (with sd, n and standard error) per species and calendar
# month, using records from years after 2000.
# Records with a missing month are dropped: they would form an NA month group
# that cannot be placed on the month axis (this appears to be the source of the
# "Removed 2 rows containing missing values" plot warning).
# NOTE(review): `year > 2000` excludes the year 2000, whereas the other
# summaries in this report use 2000-2020 inclusive; confirm which is intended.
birds_grouped_latitude_year <- sparrow_m %>%
  filter(year > 2000, !is.na(month)) %>%
  group_by(species, month) %>%
  summarise(
    mean = mean(decimalLatitude),
    sd = sd(decimalLatitude),
    n = n(),
    se = sd / sqrt(n),
    .groups = "drop"
  )
`summarise()` has grouped output by 'species'. You can override using the
`.groups` argument.
# Mean latitude of each species by calendar month, with +/- 1 standard error
# bars. The x axis is the month, so the title says "by month" and the axis
# gets one tick per month.
ggplot(birds_grouped_latitude_year, mapping = aes(month, mean, color = species)) +
  geom_path() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = .1) +
  scale_x_continuous(breaks = 1:12) +
  labs(x = "Month", y = "Latitude") +
  ggtitle(label = "Average Latitude of Sparrows by month") +
  scale_color_npg() +
  theme_classic()
Warning: Removed 2 rows containing missing values (`geom_path()`).

# Total counts by year and species for years after 1999 and before 2021;
# these totals feed the abundance comparisons that follow.

sparrows_m5_2 <- as.data.frame(
  sparrows_d %>%
    filter(time_year > 1999 & time_year < 2021) %>%
    group_by(time_year, species) %>%
    dplyr::summarise(sum_count = sum(count), .groups = "drop")
)

head(sparrows_m5_2)
  time_year           species sum_count
1      2000 Passer domesticus     58540
2      2000  Spizella pusilla     11633
3      2001 Passer domesticus     51747
4      2001  Spizella pusilla     11640
5      2002 Passer domesticus     79041
6      2002  Spizella pusilla     14655
# Yearly total abundance of each species, 2000-2020.
# The x axis is the year, so it is labelled as such. theme_classic() is a
# complete theme and replaces every theme element, so it has to come before
# the theme() tweaks; otherwise the text sizes are lost.
p5 <- ggplot(
  data = sparrows_m5_2,
  aes(x = time_year, y = sum_count, color = species)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Distribution of Sparrows from 2000-2020",
    x = "Year",
    y = "Abundance of birds"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 0, size = 15, vjust = 1),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_text(size = 16)
  )
p5

# Add the base-10 logarithm of the yearly totals as `log_counts`.
sparrows_m5_2 <- dplyr::mutate(sparrows_m5_2, log_counts = log10(sum_count))
head(sparrows_m5_2)
  time_year           species sum_count log_counts
1      2000 Passer domesticus     58540   4.767453
2      2000  Spizella pusilla     11633   4.065692
3      2001 Passer domesticus     51747   4.713885
4      2001  Spizella pusilla     11640   4.065953
5      2002 Passer domesticus     79041   4.897852
6      2002  Spizella pusilla     14655   4.165986
# Yearly log10 abundance of each species.
# sparrows_m5_2 only holds years after 1999 and before 2021, so the title
# says 2000-2020, and the x axis (the year) is labelled as such.
# theme_classic() is a complete theme and replaces every theme element, so it
# has to come before the theme() tweaks; otherwise the text sizes are lost.
p5 <- ggplot(
  data = sparrows_m5_2,
  aes(x = time_year, y = log_counts, color = species)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Distribution of Sparrows from 2000-2020",
    x = "Year",
    y = "log10(abundance of birds)"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 0, size = 15, vjust = 1),
    axis.title.y = element_text(size = 16),
    axis.title.x = element_text(size = 16)
  )
p5

There is definitely a difference in the relative log abundance of sparrows between the two species. Passer domesticus was more abundant than Spizella pusilla in every year from 2000 to 2020.