Ph.D. Course Work - 2024 on Quantitative Methods

Day 5 material

———————————————————————–

# Start from an empty workspace: remove every object currently defined.
rm(list = ls())

# Report the current working directory.
getwd()

## [1] "D:/D Drive/Ph.D. Course Work/Ph.D. 2024"

# Switch to the data folder, then print the directory again to confirm.
setwd("D:\\D Drive\\Ph.D. Course Work\\Ph.D. 2024\\Data")
getwd()

## [1] "D:/D Drive/Ph.D. Course Work/Ph.D. 2024/Data"

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
library(readr)
# Read the combined survey CSV (relative to the working directory) with readr.
survey <- read_csv(file = "combined.csv")

## Rows: 34786 Columns: 13

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the data set holding the `grass` (opinion) and `age` columns.
mar <- read.csv("legal_weed_age_GSS2016_ch1.csv")

# Clean the raw columns:
#   * grass   - converted to a factor; the "DK" and "IAP" responses become NA
#               and their now-empty levels are dropped.
#   * age     - the text value "89 OR OLDER" is stored as "89" so the column
#               can be converted to numeric.
#   * age_cat - age binned into four bands. `right = FALSE` makes each bin
#               closed on the left: [-Inf, 30), [30, 60), [60, 75), [75, Inf),
#               which is what the labels state. With the default
#               `right = TRUE` a 30-year-old was labelled "<30", a 60-year-old
#               "30-59" and a 75-year-old "60-74".
# NOTE: age_cat counts in summary() output recorded before this fix will differ
# for respondents aged exactly 30, 60 or 75.
mar_cleaned <- mar %>%
  mutate(grass = factor(grass)) %>%
  mutate(grass = if_else(grass == "DK" | grass == "IAP", NA, grass)) %>%
  mutate(grass = droplevels(grass)) %>%
  mutate(age = recode(age, "89 OR OLDER" = "89")) %>%
  mutate(age = as.numeric(age)) %>%
  mutate(
    age_cat = cut(
      age,
      breaks = c(-Inf, 30, 60, 75, Inf),
      labels = c("<30", "30-59", "60-74", "75+"),
      right = FALSE
    )
  )

# Inspect the cleaned data interactively and print per-column summaries.
View(mar_cleaned)
summary(mar_cleaned)

##        grass           age         age_cat    
##  LEGAL    :1126   Min.   :18.00   <30  : 535  
##  NOT LEGAL: 717   1st Qu.:34.00   30-59:1516  
##  NA's     :1024   Median :49.00   60-74: 564  
##                   Mean   :49.16   75+  : 242  
##                   3rd Qu.:62.00   NA's :  10  
##                   Max.   :89.00               
##                   NA's   :10

# List the column names of the survey table.
colnames(survey)

##  [1] "record_id"       "month"           "day"             "year"           
##  [5] "plot_id"         "species_id"      "sex"             "hindfoot_length"
##  [9] "weight"          "genus"           "species"         "taxa"           
## [13] "plot_type"

View(survey)

# Discard every record that has a missing value in any column.
survey_cln <- survey %>% drop_na()

# Scatter plot; alpha = 0.1 makes overlapping points semi-transparent.
ggplot(data = survey_cln, mapping = aes(x = weight, y = hindfoot_length)) +
  geom_point(alpha = 0.1)

# Scatter plot with a fitted smooth curve drawn over the points.
ggplot(data = survey_cln, mapping = aes(x = weight, y = hindfoot_length)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Contour plot of the two-dimensional density estimate.
ggplot(data = survey_cln, mapping = aes(x = weight, y = hindfoot_length)) +
  geom_density_2d()

# Number of complete records for each year/genus combination.
data_yr <- count(group_by(survey_cln, year, genus))
# View(data_yr)

# Time series with no grouping: all genera are joined into a single path.
ggplot(data = data_yr, mapping = aes(x = year, y = n)) +
  geom_line()

# Time series with one line per genus.
ggplot(data = data_yr, mapping = aes(x = year, y = n, group = genus)) +
  geom_line()

# Time series with one line per genus, each drawn in its own colour.
ggplot(data = data_yr, mapping = aes(x = year, y = n, colour = genus)) +
  geom_line()

# One panel per genus, wrapped into rows and columns; all panels share axes.
ggplot(data = data_yr, mapping = aes(x = year, y = n)) +
  geom_line() +
  facet_wrap(vars(genus))

# One panel per genus, each panel with its own axis ranges.
ggplot(data = data_yr, mapping = aes(x = year, y = n)) +
  geom_line() +
  facet_wrap(vars(genus), scales = "free")

# Number of complete records for each year/genus/sex combination.
data_yr <- count(group_by(survey_cln, year, genus, sex))

# One panel per genus, with a separate coloured line for each sex.
ggplot(data = data_yr, mapping = aes(x = year, y = n, colour = sex)) +
  geom_line() +
  facet_wrap(vars(genus))

# Grid of panels: genus down the rows, sex across the columns, coloured by sex.
ggplot(data = data_yr, mapping = aes(x = year, y = n, colour = sex)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex))

# Same genus-by-sex grid, drawn without the colour mapping.
ggplot(data = data_yr, mapping = aes(x = year, y = n)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex))

# Genus-by-sex grid with a title and axis labels.
ggplot(data = data_yr, mapping = aes(x = year, y = n, colour = sex)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex)) +
  labs(
    title = "Observed genera over time",
    x = "Year of observation",
    y = "Number of Animals"
  )

# Labelled grid again, with smaller axis text, vertical x tick labels and
# tilted facet strip labels so the text fits in the panels.
ggplot(data = data_yr, mapping = aes(x = year, y = n, colour = sex)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex)) +
  labs(
    title = "Observed genera over time",
    x = "Year of observation",
    y = "Number of Animals"
  ) +
  theme(
    axis.text.x = element_text(size = 7, angle = 90),
    axis.text.y = element_text(size = 7),
    strip.text = element_text(size = 7, angle = 45)
  )

# Histogram of weight with a title, axis labels and the black-and-white theme.
ggplot(data = survey_cln, mapping = aes(x = weight)) +
  geom_histogram() +
  labs(
    title = "Weight Distribution of Animals",
    x = "Weight",
    y = "Frequency"
  ) +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with bars filled by sex, using 100 bins.
ggplot(data = survey_cln, mapping = aes(x = weight, fill = sex)) +
  geom_histogram(bins = 100) +
  labs(
    title = "Weight Distribution of Animals",
    x = "Weight",
    y = "Frequency"
  ) +
  theme_bw()

# Histogram of weight, filled by sex and split into one panel per sex.
# `binwidth` is the geom_histogram() argument that sets the width of each bin.
# The earlier `binsize = 20` is not a recognised parameter: it was ignored with
# a warning and the default of 30 bins was used instead.
ggplot(survey_cln, aes(x = weight, fill = sex)) +
  geom_histogram(binwidth = 20) +
  labs(
    title = "Weight Distribution of Animals",
    x = "Weight",
    y = "Frequency"
  ) +
  theme_bw() +
  facet_wrap(~sex)

# Frequency polygon of weight, coloured by sex, with one panel per sex.
ggplot(data = survey_cln, mapping = aes(x = weight, colour = sex)) +
  geom_freqpoly() +
  labs(
    title = "Distribution of animal by weight",
    x = "Weight",
    y = "Frequency"
  ) +
  theme_bw() +
  facet_wrap(vars(sex))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart of the number of records per genus, one fill colour per genus.
# The x axis maps `genus`, so it is labelled "Genus" (it previously read
# "Species", which contradicted both the mapping and the title).
ggplot(survey_cln, aes(x = genus, fill = genus)) +
  geom_bar() +
  labs(
    title = "Distribution of Animals by genera",
    x = "Genus",
    y = "Frequency"
  ) +
  # Small, vertical tick labels so the genus names do not overlap.
  theme(axis.text.x = element_text(size = 7, angle = 90))

# Keep only the rows with a non-missing `grass` answer.
mar_cleaned <- drop_na(mar_cleaned, grass)

# Count rows per answer and age band, then express each count as a percentage
# of the total for its age band.
data <- mar_cleaned %>%
  count(grass, age_cat) %>%
  group_by(age_cat) %>%
  mutate(per_cnt = 100 * n / sum(n))

# Side-by-side bars: percentage of each answer within every age band.
ggplot(data = data, mapping = aes(x = age_cat, y = per_cnt, fill = grass)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  scale_fill_manual(values = c("#31a354", "#e6550d")) +
  labs(
    title = "Should marijuana be legal?",
    x = "Age Category",
    y = "Count in %"
  )