Data Analysis using R

Week2 - Day 1 material

———————————————————————–

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(readr)
# Folder that holds the course data sets (machine-specific Windows path).
dataFdr <- "D:\\D Drive\\Certificate Course\\data"
filename <- "combined.csv"
# file.path() joins folder and file name with a separator that is valid on
# every platform, instead of hard-coding a backslash through paste().
dataFile <- file.path(dataFdr, filename)
# Read the CSV into a tibble; readr guesses the column types and reports them.
survey <- read_csv(dataFile)
## Rows: 34786 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Point dataFile at the second data set, reusing the dataFdr folder set above.
filename <- "legal_weed_age_GSS2016_ch1.csv"
# file.path() builds the path portably instead of pasting a literal backslash.
dataFile <- file.path(dataFdr, filename)
## dataFile

# Base read.csv() is used here, so `mar` is a plain data.frame, not a tibble.
mar <- read.csv(dataFile)

# Clean the raw `mar` data:
#   * grass   - the answers "DK" and "IAP" become NA and their now-empty factor
#               levels are dropped;
#   * age     - the text value "89 OR OLDER" is replaced by "89" and the column
#               is converted to numeric;
#   * age_cat - age binned into four labelled groups.
mar_cleaned <- mar %>%
  mutate(grass = factor(grass)) %>%
  # %in% tests against the whole set at once and never yields NA itself.
  mutate(grass = if_else(grass %in% c("DK", "IAP"), NA, grass)) %>%
  mutate(grass = droplevels(grass)) %>%
  mutate(age = recode(age, "89 OR OLDER" = "89")) %>%
  mutate(age = as.numeric(age)) %>%
  # right = FALSE makes every bin [lower, upper), so ages 30, 60 and 75 land in
  # "30-59", "60-74" and "75+" as the labels promise. With cut()'s default
  # (right = TRUE) those ages were placed one group too low.
  mutate(age_cat = cut(
    age,
    breaks = c(-Inf, 30, 60, 75, Inf),
    labels = c("<30", "30-59", "60-74", "75+"),
    right = FALSE
  ))
View(mar_cleaned)
summary(mar_cleaned)
# NOTE: the summary below was recorded before the bin edges were corrected.
# On a re-run, respondents aged exactly 30, 60 or 75 move up one age_cat
# group, so the age_cat counts will differ slightly from the ones shown.
##        grass           age         age_cat    
##  LEGAL    :1126   Min.   :18.00   <30  : 535  
##  NOT LEGAL: 717   1st Qu.:34.00   30-59:1516  
##  NA's     :1024   Median :49.00   60-74: 564  
##                   Mean   :49.16   75+  : 242  
##                   3rd Qu.:62.00   NA's :  10  
##                   Max.   :89.00               
##                   NA's   :10
# Re-read the combined survey file (dataFdr is the data folder set earlier).
filename <- "combined.csv"
# file.path() builds the path portably instead of pasting a literal backslash.
dataFile <- file.path(dataFdr, filename)
survey <- read_csv(dataFile)
## Rows: 34786 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# List the column names of the survey table.
colnames(survey)
##  [1] "record_id"       "month"           "day"             "year"           
##  [5] "plot_id"         "species_id"      "sex"             "hindfoot_length"
##  [9] "weight"          "genus"           "species"         "taxa"           
## [13] "plot_type"
# Open the table in the spreadsheet-style data viewer.
View(survey)
# Keep only the rows that have no missing value in any column.
survey_cln <- survey %>%
  drop_na()
# Scatter plot of hindfoot length against weight; points are drawn at 10%
# opacity so that overlapping points build up into darker regions.
ggplot(data = survey_cln, mapping = aes(weight, hindfoot_length)) +
  geom_point(alpha = 0.1)

# Same scatter plot with a smoothed trend line drawn on top.
ggplot(data = survey_cln, mapping = aes(weight, hindfoot_length)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Contour lines of the 2D density of the same two variables.
ggplot(data = survey_cln, mapping = aes(weight, hindfoot_length)) +
  geom_density_2d()

# Count the records for every year/genus combination (the count column is n).
data_yr <- survey_cln %>%
  group_by(year, genus) %>%
  count()
# View(data_yr)

# Time series with a single line drawn through all year/genus counts.
ggplot(data = data_yr, mapping = aes(year, n)) +
  geom_line()

# Time series with one line per genus.
ggplot(data = data_yr, mapping = aes(year, n, group = genus)) +
  geom_line()

# Time series with one line per genus, each genus in its own colour.
ggplot(data = data_yr, mapping = aes(year, n, color = genus)) +
  geom_line()

# One panel per genus, wrapped into rows and columns; all panels share the
# same axis ranges.
ggplot(data = data_yr, mapping = aes(year, n)) +
  geom_line() +
  facet_wrap(vars(genus))

# One panel per genus again, but each panel gets its own axis ranges.
ggplot(data = data_yr, mapping = aes(year, n)) +
  geom_line() +
  facet_wrap(vars(genus), scales = "free")

# Recount, this time for every year/genus/sex combination.
data_yr <- survey_cln %>%
  group_by(year, genus, sex) %>%
  count()

# One panel per genus, with the sexes drawn as separately coloured lines.
ggplot(data = data_yr, mapping = aes(year, n, color = sex)) +
  geom_line() +
  facet_wrap(vars(genus))

# Grid of panels: one row per genus, one column per sex, coloured by sex.
ggplot(data = data_yr, mapping = aes(year, n, color = sex)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex))

# Same genus-by-sex grid, without the colour mapping.
ggplot(data = data_yr, mapping = aes(year, n)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex))

# Genus-by-sex grid with a title and axis labels.
ggplot(data = data_yr, mapping = aes(year, n, color = sex)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex)) +
  labs(
    x = "Year of observation",
    y = "Number of Animals",
    title = "Observed genera over time"
  )

# Same labelled grid with smaller text; x tick labels are turned 90 degrees
# and the facet strip labels 45 degrees.
ggplot(data = data_yr, mapping = aes(year, n, color = sex)) +
  geom_line() +
  facet_grid(rows = vars(genus), cols = vars(sex)) +
  labs(
    x = "Year of observation",
    y = "Number of Animals",
    title = "Observed genera over time"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, size = 7),
    axis.text.y = element_text(size = 7),
    strip.text = element_text(angle = 45, size = 7)
  )

# Histogram of weight with a title, axis labels and the black-and-white theme.
ggplot(data = survey_cln, mapping = aes(weight)) +
  geom_histogram() +
  labs(
    x = "Weight",
    y = "Frequency",
    title = "Weight Distribution of Animals"
  ) +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram split into 100 bins, with the bars filled by sex.
ggplot(data = survey_cln, mapping = aes(weight, fill = sex)) +
  geom_histogram(bins = 100) +
  labs(
    x = "Weight",
    y = "Frequency",
    title = "Weight Distribution of Animals"
  ) +
  theme_bw()

# Histogram of weight with title, fill colour and one panel per sex.
# `binwidth` is the geom_histogram() argument that sets the bin size. The
# earlier `binsize` is not a recognised parameter, so it was ignored with a
# warning and the default of 30 bins was used instead.
ggplot(survey_cln, aes(x = weight, fill = sex)) +
  geom_histogram(binwidth = 20) +
  labs(
    title = "Weight Distribution of Animals",
    x = "Weight",
    y = "Frequency"
  ) +
  theme_bw() +
  facet_wrap(~sex)

# Frequency polygon of weight: one panel per sex, line coloured by sex.
ggplot(data = survey_cln, mapping = aes(weight, color = sex)) +
  geom_freqpoly() +
  labs(
    x = "Weight",
    y = "Frequency",
    title = "Distribution of animal by weight"
  ) +
  theme_bw() +
  facet_wrap(vars(sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Bar chart of the number of records per genus, one fill colour per genus.
# The x axis shows `genus`, so it is labelled "Genus" (it used to say
# "Species", which is a different column of the data).
ggplot(survey_cln, aes(x = genus, fill = genus)) +
  geom_bar() +
  labs(
    title = "Distribution of Animals by genera",
    x = "Genus",
    y = "Frequency"
  ) +
  # Small, vertical tick labels keep the genus names from overlapping.
  theme(axis.text.x = element_text(size = 7, angle = 90))

# Drop the rows whose `grass` value is missing.
mar_cleaned <- mar_cleaned %>%
  drop_na(grass)

# names(mar_cleaned)
# Bar Plot
# NOTE(review): no bar plot follows this heading; the plot below is a
# jittered scatter plot of a different data set.

# Jittered scatter plot of the two knowledge scores on a fixed-ratio 0-5 grid,
# with reference lines at 2.5 on both axes.
# The string literals used typographic (curly) quotes, which R cannot parse;
# they are now plain double quotes. The jitter arguments are spelled out in
# full instead of relying on partial matching of `h` / `w`.
# NOTE(review): `data` is not created anywhere in the visible script. It must
# be a data frame with `knowledge_stat` and `knowledge_prog` columns; confirm
# where it is loaded before running this plot.
ggplot(data, aes(x = knowledge_stat, y = knowledge_prog)) +
  geom_point(
    position = position_jitter(height = 0.1, width = 0.1),
    shape = 21,
    alpha = 0.5,
    size = 3,
    color = "red"
  ) +
  lims(x = c(0, 5), y = c(0, 5)) +
  theme_classic() +
  coord_fixed() +
  geom_vline(xintercept = 2.5) +
  geom_hline(yintercept = 2.5) +
  labs(
    x = "Knowledge in Statistics",
    y = "Knowledge in Programming",
    title = "Participants Distribution"
  )

```