# -----------------------------------------------------------------------
# NOTE: the script no longer starts with rm(list = ls()). That call deletes
# every object in the caller's global environment without detaching packages
# or resetting options, so it never gave a truly clean session; restart R
# instead when a clean slate is needed.

# Get the current working directory.
getwd()
## [1] "D:/D Drive/Ph.D. Course Work/Ph.D. 2024"

# Set the working directory to the folder that holds the data files.
# The path is machine-specific, so only switch to it when it exists; otherwise
# warn and let the CSV files be looked up in the current working directory.
data_dir <- "D:\\D Drive\\Ph.D. Course Work\\Ph.D. 2024\\Data"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  warning(
    "Data directory not found: ", data_dir,
    "; reading files from ", getwd(),
    call. = FALSE
  )
}
getwd()
## [1] "D:/D Drive/Ph.D. Course Work/Ph.D. 2024/Data"

# Load combined.csv into the R object `survey`.
library(readr)
survey <- read_csv("combined.csv")
## Rows: 34786 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() shows the contents of the object in tabular form.
View(x = survey)
# str() prints the structure of the object.
str(object = survey)
## spc_tbl_ [34,786 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ record_id : num [1:34786] 1 72 224 266 349 363 435 506 588 661 ...
## $ month : num [1:34786] 7 8 9 10 11 11 12 1 2 3 ...
## $ day : num [1:34786] 16 19 13 16 12 12 10 8 18 11 ...
## $ year : num [1:34786] 1977 1977 1977 1977 1977 ...
## $ plot_id : num [1:34786] 2 2 2 2 2 2 2 2 2 2 ...
## $ species_id : chr [1:34786] "NL" "NL" "NL" "NL" ...
## $ sex : chr [1:34786] "M" "M" NA NA ...
## $ hindfoot_length: num [1:34786] 32 31 NA NA NA NA NA NA NA NA ...
## $ weight : num [1:34786] NA NA NA NA NA NA NA NA 218 NA ...
## $ genus : chr [1:34786] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
## $ species : chr [1:34786] "albigula" "albigula" "albigula" "albigula" ...
## $ taxa : chr [1:34786] "Rodent" "Rodent" "Rodent" "Rodent" ...
## $ plot_type : chr [1:34786] "Control" "Control" "Control" "Control" ...
## - attr(*, "spec")=
## .. cols(
## .. record_id = col_double(),
## .. month = col_double(),
## .. day = col_double(),
## .. year = col_double(),
## .. plot_id = col_double(),
## .. species_id = col_character(),
## .. sex = col_character(),
## .. hindfoot_length = col_double(),
## .. weight = col_double(),
## .. genus = col_character(),
## .. species = col_character(),
## .. taxa = col_character(),
## .. plot_type = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Note that the structure of the object is also shown in the environment.
# names() returns the column names of the data frame.
names(x = survey)
## [1] "record_id" "month" "day" "year"
## [5] "plot_id" "species_id" "sex" "hindfoot_length"
## [9] "weight" "genus" "species" "taxa"
## [13] "plot_type"
# rownames() returns the row names of the data frame (call left disabled).
# rownames(survey)
# dim() returns a vector: 1st element is the number of rows, 2nd element is
# the number of columns.
dim(x = survey)
## [1] 34786 13
# nrow() returns the number of rows of the data frame.
nrow(x = survey)
## [1] 34786
# ncol() returns the number of columns of the data frame.
ncol(x = survey)
## [1] 13
# summary() returns summary statistics for each column of the data frame.
summary(object = survey)
## record_id month day year plot_id
## Min. : 1 Min. : 1.000 Min. : 1.0 Min. :1977 Min. : 1.00
## 1st Qu.: 8964 1st Qu.: 4.000 1st Qu.: 9.0 1st Qu.:1984 1st Qu.: 5.00
## Median :17762 Median : 6.000 Median :16.0 Median :1990 Median :11.00
## Mean :17804 Mean : 6.474 Mean :16.1 Mean :1990 Mean :11.34
## 3rd Qu.:26655 3rd Qu.:10.000 3rd Qu.:23.0 3rd Qu.:1997 3rd Qu.:17.00
## Max. :35548 Max. :12.000 Max. :31.0 Max. :2002 Max. :24.00
##
## species_id sex hindfoot_length weight
## Length:34786 Length:34786 Min. : 2.00 Min. : 4.00
## Class :character Class :character 1st Qu.:21.00 1st Qu.: 20.00
## Mode :character Mode :character Median :32.00 Median : 37.00
## Mean :29.29 Mean : 42.67
## 3rd Qu.:36.00 3rd Qu.: 48.00
## Max. :70.00 Max. :280.00
## NA's :3348 NA's :2503
## genus species taxa plot_type
## Length:34786 Length:34786 Length:34786 Length:34786
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
# Select the species_id column for the first 2 rows.
# The column is addressed by name instead of by position (6) so the selection
# still returns species_id if the column order of the input file changes.
survey[1:2, "species_id"]
## # A tibble: 2 × 1
## species_id
## <chr>
## 1 NL
## 2 NL
# Select all the male animals surveyed in the year 1980.
# `sex` contains NA values, and a logical row index that is NA produces an
# all-NA row in the result. which() keeps only the positions that are TRUE.
survey[which(survey$sex == "M" & survey$year == 1980), ]
# NOTE(review): the output captured below predates this fix; its row count may
# include all-NA rows for 1980 records with a missing sex. Re-run to refresh.
## # A tibble: 770 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 2555 3 9 1980 2 DM M NA 42
## 2 2622 3 9 1980 2 DM M NA 43
## 3 2631 3 9 1980 2 DM M NA 44
## 4 2643 3 9 1980 2 DM M NA 43
## 5 2653 3 9 1980 2 DM M NA 51
## 6 2662 3 9 1980 2 DM M NA 47
## 7 2692 3 9 1980 2 DM M NA 45
## 8 2695 3 9 1980 2 DM M NA 45
## 9 2902 4 18 1980 2 DM M NA 48
## 10 2906 4 18 1980 2 DM M NA 43
## # ℹ 760 more rows
## # ℹ 4 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>
# Select the last row of the data.
tail(survey, n = 1)
## # A tibble: 1 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 30986 7 1 2000 7 PX <NA> NA NA
## # ℹ 4 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>
# Select the middle row of the data.
# nrow(survey) / 2 is fractional when the row count is odd, and a tibble
# rejects a fractional row index; rounding up always gives a whole-number
# index (and the same row as before for an even row count).
survey[ceiling(nrow(survey) / 2), ]
## # A tibble: 1 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 9828 1 19 1985 14 AB <NA> NA NA
## # ℹ 4 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>
# Look at what summary() reports for the survey data frame at this point.
summary(object = survey)
## record_id month day year plot_id
## Min. : 1 Min. : 1.000 Min. : 1.0 Min. :1977 Min. : 1.00
## 1st Qu.: 8964 1st Qu.: 4.000 1st Qu.: 9.0 1st Qu.:1984 1st Qu.: 5.00
## Median :17762 Median : 6.000 Median :16.0 Median :1990 Median :11.00
## Mean :17804 Mean : 6.474 Mean :16.1 Mean :1990 Mean :11.34
## 3rd Qu.:26655 3rd Qu.:10.000 3rd Qu.:23.0 3rd Qu.:1997 3rd Qu.:17.00
## Max. :35548 Max. :12.000 Max. :31.0 Max. :2002 Max. :24.00
##
## species_id sex hindfoot_length weight
## Length:34786 Length:34786 Min. : 2.00 Min. : 4.00
## Class :character Class :character 1st Qu.:21.00 1st Qu.: 20.00
## Mode :character Mode :character Median :32.00 Median : 37.00
## Mean :29.29 Mean : 42.67
## 3rd Qu.:36.00 3rd Qu.: 48.00
## Max. :70.00 Max. :280.00
## NA's :3348 NA's :2503
## genus species taxa plot_type
## Length:34786 Length:34786 Length:34786 Length:34786
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
# Convert the taxa and genus columns to factors.
survey[["taxa"]] <- as.factor(survey[["taxa"]])
survey[["genus"]] <- as.factor(survey[["genus"]])
# Run summary() again and compare: the factor columns now show level counts.
summary(object = survey)
## record_id month day year plot_id
## Min. : 1 Min. : 1.000 Min. : 1.0 Min. :1977 Min. : 1.00
## 1st Qu.: 8964 1st Qu.: 4.000 1st Qu.: 9.0 1st Qu.:1984 1st Qu.: 5.00
## Median :17762 Median : 6.000 Median :16.0 Median :1990 Median :11.00
## Mean :17804 Mean : 6.474 Mean :16.1 Mean :1990 Mean :11.34
## 3rd Qu.:26655 3rd Qu.:10.000 3rd Qu.:23.0 3rd Qu.:1997 3rd Qu.:17.00
## Max. :35548 Max. :12.000 Max. :31.0 Max. :2002 Max. :24.00
##
## species_id sex hindfoot_length weight
## Length:34786 Length:34786 Min. : 2.00 Min. : 4.00
## Class :character Class :character 1st Qu.:21.00 1st Qu.: 20.00
## Mode :character Mode :character Median :32.00 Median : 37.00
## Mean :29.29 Mean : 42.67
## 3rd Qu.:36.00 3rd Qu.: 48.00
## Max. :70.00 Max. :280.00
## NA's :3348 NA's :2503
## genus species taxa plot_type
## Dipodomys :16167 Length:34786 Bird : 450 Length:34786
## Chaetodipus : 6029 Class :character Rabbit : 75 Class :character
## Onychomys : 3267 Mode :character Reptile: 14 Mode :character
## Reithrodontomys: 2694 Rodent :34247
## Peromyscus : 2234
## Perognathus : 1629
## (Other) : 2766
# Build survey_200, a data frame holding only row 200 of the survey dataset.
survey_200 <- survey[200, ]
survey_200
## # A tibble: 1 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 35212 12 7 2002 2 NL M 33 248
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# Combine nrow() and exclusion (-) notation to reproduce the behavior of
# head(survey), i.e. keep the first 6 rows.
# 7:nrow(survey) counts downwards when there are fewer than 7 rows and would
# then drop rows that head() keeps, so only exclude when a 7th row exists.
if (nrow(survey) > 6) {
  survey_head <- survey[-(7:nrow(survey)), ]
} else {
  survey_head <- survey
}
survey_head
## # A tibble: 6 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# Reference result to compare against.
head(survey)
## # A tibble: 6 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# How many rabbits were observed?
# Count the matching rows directly. na.rm = TRUE keeps a missing taxa value
# from being counted; row-subsetting on a logical NA would add an all-NA row
# and inflate nrow().
sum(survey$taxa == "Rabbit", na.rm = TRUE)
## [1] 75
# Which genera appear in the genus column?
unique(survey$genus)
## [1] Neotoma Dipodomys Perognathus Peromyscus
## [5] Chaetodipus Sigmodon Onychomys Spermophilus
## [9] Reithrodontomys Sylvilagus Ammospermophilus Amphispiza
## [13] Calamospiza Callipepla Rodent Pipilo
## [17] Lizard Baiomys Campylorhynchus Crotalus
## [21] Sparrow Pooecetes Ammodramus Zonotrichia
## [25] Sceloporus Cnemidophorus
## 26 Levels: Ammodramus Ammospermophilus Amphispiza Baiomys ... Zonotrichia
# How many different genera are in the genus column?
# unique() only lists the values; length() turns the listing into a count.
length(unique(survey$genus))
## [1] 26
# select
View(x = survey)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# select verb: select(df, column1, column2, ...)
survey %>% select(year, species_id)
## # A tibble: 34,786 × 2
## year species_id
## <dbl> <chr>
## 1 1977 NL
## 2 1977 NL
## 3 1977 NL
## 4 1977 NL
## 5 1977 NL
## 6 1977 NL
## 7 1977 NL
## 8 1978 NL
## 9 1978 NL
## 10 1978 NL
## # ℹ 34,776 more rows
# filter verb: filter(df, filter condition)
survey %>% filter(sex == "M")
## # A tibble: 17,348 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 588 2 18 1978 2 NL M NA 218
## 4 845 5 6 1978 2 NL M 32 204
## 5 990 6 9 1978 2 NL M NA 200
## 6 1164 8 5 1978 2 NL M 34 199
## 7 1261 9 4 1978 2 NL M 32 197
## 8 1453 11 5 1978 2 NL M NA 218
## 9 1756 4 29 1979 2 NL M 33 166
## 10 1818 5 30 1979 2 NL M 32 184
## # ℹ 17,338 more rows
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# is.na() returns a logical value for every element: TRUE where the value is
# NA, FALSE otherwise.
#is.na(select(survey,sex))
# Create data frame survey1 from the columns species_id, weight,
# hindfoot_length and year.
survey1 <- select(.data = survey, species_id, weight, hindfoot_length, year)
# Create data frame survey2 with the rows of survey1 where year > 1995.
survey2 <- filter(.data = survey1, year > 1995)
View(x = survey2)
# survey2 can be created in a single step by nesting the previous two calls.
survey2 <- filter(
  select(survey, species_id, weight, hindfoot_length, year),
  year > 1995
)
View(x = survey2)
# The pipe operator (%>%) can be used to simplify the above code.
survey2 <- survey %>%
  select(species_id, weight, hindfoot_length, year) %>%
  filter(year > 1995)
View(x = survey2)
# drop_na() removes the rows that contain NA values from the data frame.
# drop_na() is defined in the tidyr library.
library(tidyr)
survey %>% drop_na()
## # A tibble: 30,676 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 845 5 6 1978 2 NL M 32 204
## 2 1164 8 5 1978 2 NL M 34 199
## 3 1261 9 4 1978 2 NL M 32 197
## 4 1756 4 29 1979 2 NL M 33 166
## 5 1818 5 30 1979 2 NL M 32 184
## 6 1882 7 4 1979 2 NL M 32 206
## 7 2133 10 25 1979 2 NL F 33 274
## 8 2184 11 17 1979 2 NL F 30 186
## 9 2406 1 16 1980 2 NL F 33 184
## 10 3000 5 18 1980 2 NL F 31 87
## # ℹ 30,666 more rows
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# Create a new data frame surveyCln by dropping every row of survey that has
# an NA value.
surveyCln <- survey %>% drop_na()
# mutate verb: here weight is replaced by weight divided by 1000.
mutate(.data = surveyCln, weight = weight / 1000)
## # A tibble: 30,676 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 845 5 6 1978 2 NL M 32 0.204
## 2 1164 8 5 1978 2 NL M 34 0.199
## 3 1261 9 4 1978 2 NL M 32 0.197
## 4 1756 4 29 1979 2 NL M 33 0.166
## 5 1818 5 30 1979 2 NL M 32 0.184
## 6 1882 7 4 1979 2 NL M 32 0.206
## 7 2133 10 25 1979 2 NL F 33 0.274
## 8 2184 11 17 1979 2 NL F 30 0.186
## 9 2406 1 16 1980 2 NL F 33 0.184
## 10 3000 5 18 1980 2 NL F 31 0.087
## # ℹ 30,666 more rows
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# arrange verb: order the rows by year.
arrange(.data = surveyCln, year)
## # A tibble: 30,676 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 226 9 13 1977 2 DM M 37 51
## 2 233 9 13 1977 2 DM M 25 44
## 3 245 10 16 1977 2 DM M 37 39
## 4 251 10 16 1977 2 DM M 36 49
## 5 257 10 16 1977 2 DM M 37 47
## 6 259 10 16 1977 2 DM M 36 41
## 7 268 10 16 1977 2 DM F 36 55
## 8 346 11 12 1977 2 DM F 37 36
## 9 350 11 12 1977 2 DM M 37 47
## 10 354 11 12 1977 2 DM M 38 44
## # ℹ 30,666 more rows
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# Same ordering, but with year descending.
arrange(.data = surveyCln, desc(year))
## # A tibble: 30,676 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 33415 2 9 2002 2 NL M 33 120
## 2 33583 3 13 2002 2 NL M 32 136
## 3 33586 3 13 2002 2 NL F 32 196
## 4 33847 4 17 2002 2 NL F 30 149
## 5 33966 5 15 2002 2 NL F 31 188
## 6 34198 6 15 2002 2 NL F 31 139
## 7 34783 10 5 2002 2 NL M 36 226
## 8 34991 11 9 2002 2 NL M 33 238
## 9 35212 12 7 2002 2 NL M 33 248
## 10 33329 1 12 2002 2 DM M 37 47
## # ℹ 30,666 more rows
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# group_by verb: group the rows by sex.
group_by(.data = surveyCln, sex)
## # A tibble: 30,676 × 13
## # Groups: sex [2]
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 845 5 6 1978 2 NL M 32 204
## 2 1164 8 5 1978 2 NL M 34 199
## 3 1261 9 4 1978 2 NL M 32 197
## 4 1756 4 29 1979 2 NL M 33 166
## 5 1818 5 30 1979 2 NL M 32 184
## 6 1882 7 4 1979 2 NL M 32 206
## 7 2133 10 25 1979 2 NL F 33 274
## 8 2184 11 17 1979 2 NL F 30 186
## 9 2406 1 16 1980 2 NL F 33 184
## 10 3000 5 18 1980 2 NL F 31 87
## # ℹ 30,666 more rows
## # ℹ 4 more variables: genus <fct>, species <chr>, taxa <fct>, plot_type <chr>
# summarize verb: mean and standard deviation of weight for each sex.
surveyCln %>%
  group_by(sex) %>%
  summarise(meanw = mean(weight), stdw = sd(weight))
## # A tibble: 2 × 3
## sex meanw stdw
## <chr> <dbl> <dbl>
## 1 F 41.5 36.2
## 2 M 42.1 35.2
# Load the file legal_weed_age_GSS2016_ch1.csv into an R data frame.
mardf <- read_csv(file = "legal_weed_age_GSS2016_ch1.csv")
## Rows: 2867 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): grass, age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Clean the data frame by applying the following rules.
# Redefine the column grass as a factor.
mardf[["grass"]] <- as.factor(mardf[["grass"]])
summary(object = mardf)
## grass age
## DK : 110 Length:2867
## IAP : 911 Class :character
## LEGAL :1126 Mode :character
## NOT LEGAL: 717
## NA's : 3
# Replace the grass values "DK" and "IAP" with NA.
# Hint: use an if_else() clause to update the values.
# Syntax: if_else(condition, value_if_true, value_if_false)
mardf_cln <- mardf %>%
  mutate(grass = if_else(grass %in% c("DK", "IAP"), NA, grass))
summary(object = mardf_cln)
## grass age
## DK : 0 Length:2867
## IAP : 0 Class :character
## LEGAL :1126 Mode :character
## NOT LEGAL: 717
## NA's :1024
# Replace the age value "89 OR OLDER" with "89".
mardf_cln <- mardf_cln %>%
  mutate(age = if_else(age == "89 OR OLDER", "89", age))
# Redefine the age column as numeric.
mardf_cln$age <- as.numeric(mardf_cln$age)
# Create a column age_cat with the 4 categories <30, 30-59, 60-74, 75+ based on
# the values of the age column.
# Hint: use cut() to split the age values into levels.
# Syntax: cut(x, breaks, labels, right)
# cut() closes intervals on the right by default, which would put ages 30, 60
# and 75 into "<30", "30-59" and "60-74". right = FALSE makes the bins
# [-Inf, 30), [30, 60), [60, 75), [75, Inf) so that they match the labels.
mardf_cln <- mardf_cln %>%
  mutate(
    age_cat = cut(
      age,
      breaks = c(-Inf, 30, 60, 75, Inf),
      labels = c("<30", "30-59", "60-74", "75+"),
      right = FALSE
    )
  )
mardf_cln
## # A tibble: 2,867 × 3
## grass age age_cat
## <fct> <dbl> <fct>
## 1 <NA> 47 30-59
## 2 LEGAL 61 60-74
## 3 NOT LEGAL 72 60-74
## 4 <NA> 43 30-59
## 5 LEGAL 55 30-59
## 6 LEGAL 53 30-59
## 7 <NA> 50 30-59
## 8 NOT LEGAL 23 <30
## 9 <NA> 45 30-59
## 10 NOT LEGAL 71 60-74
## # ℹ 2,857 more rows
# Execute the summary function on the cleaned data.
# NOTE(review): the age_cat counts captured below come from the earlier
# right-closed bins (ages 30, 60 and 75 counted one category too low);
# re-run to refresh them. The grass and age columns are unaffected.
summary(mardf_cln)
## grass age age_cat
## DK : 0 Min. :18.00 <30 : 535
## IAP : 0 1st Qu.:34.00 30-59:1516
## LEGAL :1126 Median :49.00 60-74: 564
## NOT LEGAL: 717 Mean :49.16 75+ : 242
## NA's :1024 3rd Qu.:62.00 NA's : 10
## Max. :89.00
## NA's :10