# read data from library HSAUR3
dta_backpain <- HSAUR3::backpain
# cheack the structure
str(dta_backpain)
## 'data.frame': 434 obs. of 4 variables:
## $ ID : Factor w/ 217 levels "1","2","3","4",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ status : Factor w/ 2 levels "case","control": 1 2 1 2 1 2 1 2 1 2 ...
## $ driver : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 1 1 2 2 ...
## $ suburban: Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
# print the first 6 lines
head(dta_backpain)
## ID status driver suburban
## 1 1 case yes yes
## 2 1 control yes no
## 3 2 case yes yes
## 4 2 control yes yes
## 5 3 case yes no
## 6 3 control yes yes
# create a summary table and make it a data frame
dta_backpain_table <- as.data.frame(with(dta_backpain, table(status, driver, suburban)))
# print it we can see there are 8 categories and the numbers of people in each category
dta_backpain_table
## status driver suburban Freq
## 1 case no no 26
## 2 control no no 47
## 3 case yes no 64
## 4 control yes no 63
## 5 case no yes 6
## 6 control no yes 7
## 7 case yes yes 121
## 8 control yes yes 100
# make it wide shape
dta_backpain_table_wide <- reshape(dta_backpain_table,
# the columns we don't want to expand
idvar=c("driver", "suburban"),
# the column we want to expand
timevar="status",
# reshape to wide format
direction="wide")
# rename the 3rd and 4th columns
colnames(dta_backpain_table_wide)[3:4] <- c("case", "control")
# compute the total number of each row
dta_backpain_table_wide$total <- dta_backpain_table_wide$case + dta_backpain_table_wide$control
# print it to check the format
dta_backpain_table_wide
## driver suburban case control total
## 1 no no 26 47 73
## 3 yes no 64 63 127
## 5 no yes 6 7 13
## 7 yes yes 121 100 221
# sort the table in the provided order
dta_backpain_table_wide[order(dta_backpain_table_wide$driver), ]
## driver suburban case control total
## 1 no no 26 47 73
## 5 no yes 6 7 13
## 3 yes no 64 63 127
## 7 yes yes 121 100 221
# load tidyverse
library(tidyverse)
dta_backpain %>%
# group these 3 variables
group_by(driver, suburban, status) %>%
# compute the number of subjects in each category
summarise(count = length(status)) %>%
# make it wide shape, col names from status and value from count
pivot_wider(names_from = status, values_from = count) %>%
# compute the total number of each row and store it in the new column named total
mutate(total = case + control)
## # A tibble: 4 x 5
## # Groups: driver, suburban [4]
## driver suburban case control total
## <fct> <fct> <int> <int> <int>
## 1 no no 26 47 73
## 2 no yes 6 7 13
## 3 yes no 64 63 127
## 4 yes yes 121 100 221
dta_state <- as.data.frame(datasets::state.x77)
dta_arrest <- as.data.frame(datasets::USArrests)
# create a column for row names
dta_state$state <- row.names(dta_state)
dta_arrest$state <- row.names(dta_arrest)
# merge two data object by ID "state"
dta_state_arrest_merge <- merge(dta_state, dta_arrest, by = "state", all = T)
head(dta_state_arrest_merge)
## state Population Income Illiteracy Life Exp Murder.x HS Grad Frost
## 1 Alabama 3615 3624 2.1 69.05 15.1 41.3 20
## 2 Alaska 365 6315 1.5 69.31 11.3 66.7 152
## 3 Arizona 2212 4530 1.8 70.55 7.8 58.1 15
## 4 Arkansas 2110 3378 1.9 70.66 10.1 39.9 65
## 5 California 21198 5114 1.1 71.71 10.3 62.6 20
## 6 Colorado 2541 4884 0.7 72.06 6.8 63.9 166
## Area Murder.y Assault UrbanPop Rape
## 1 50708 13.2 236 58 21.2
## 2 566432 10.0 263 48 44.5
## 3 113417 8.1 294 80 31.0
## 4 51945 8.8 190 50 19.5
## 5 156361 9.0 276 91 40.6
## 6 103766 7.9 204 78 38.7
# compute all pair-wise correlations for numerical variables, in other words remove the first column
cor(dta_state_arrest_merge[, -1])
## Population Income Illiteracy Life Exp Murder.x
## Population 1.00000000 0.20822756 0.10762237 -0.06805195 0.34364275
## Income 0.20822756 1.00000000 -0.43707519 0.34025534 -0.23007761
## Illiteracy 0.10762237 -0.43707519 1.00000000 -0.58847793 0.70297520
## Life Exp -0.06805195 0.34025534 -0.58847793 1.00000000 -0.78084575
## Murder.x 0.34364275 -0.23007761 0.70297520 -0.78084575 1.00000000
## HS Grad -0.09848975 0.61993232 -0.65718861 0.58221620 -0.48797102
## Frost -0.33215245 0.22628218 -0.67194697 0.26206801 -0.53888344
## Area 0.02254384 0.36331544 0.07726113 -0.10733194 0.22839021
## Murder.y 0.32024487 -0.21520501 0.70677564 -0.77849850 0.93369089
## Assault 0.31702281 0.04093255 0.51101299 -0.62665800 0.73976479
## UrbanPop 0.51260491 0.48053302 -0.06219936 0.27146824 0.01638255
## Rape 0.30523361 0.35738678 0.15459686 -0.26956828 0.57996132
## HS Grad Frost Area Murder.y Assault
## Population -0.09848975 -0.3321525 0.02254384 0.32024487 0.31702281
## Income 0.61993232 0.2262822 0.36331544 -0.21520501 0.04093255
## Illiteracy -0.65718861 -0.6719470 0.07726113 0.70677564 0.51101299
## Life Exp 0.58221620 0.2620680 -0.10733194 -0.77849850 -0.62665800
## Murder.x -0.48797102 -0.5388834 0.22839021 0.93369089 0.73976479
## HS Grad 1.00000000 0.3667797 0.33354187 -0.52159126 -0.23030510
## Frost 0.36677970 1.0000000 0.05922910 -0.54139702 -0.46823989
## Area 0.33354187 0.0592291 1.00000000 0.14808597 0.23120879
## Murder.y -0.52159126 -0.5413970 0.14808597 1.00000000 0.80187331
## Assault -0.23030510 -0.4682399 0.23120879 0.80187331 1.00000000
## UrbanPop 0.35868123 -0.2461862 -0.06154747 0.06957262 0.25887170
## Rape 0.27072504 -0.2792054 0.52495510 0.56357883 0.66524123
## UrbanPop Rape
## Population 0.51260491 0.3052336
## Income 0.48053302 0.3573868
## Illiteracy -0.06219936 0.1545969
## Life Exp 0.27146824 -0.2695683
## Murder.x 0.01638255 0.5799613
## HS Grad 0.35868123 0.2707250
## Frost -0.24618618 -0.2792054
## Area -0.06154747 0.5249551
## Murder.y 0.06957262 0.5635788
## Assault 0.25887170 0.6652412
## UrbanPop 1.00000000 0.4113412
## Rape 0.41134124 1.0000000
The data set concerns species and weight of animals caught in plots in a study area in Arizona over time.
Each row holds information for a single animal, and the columns represent:
# load package
pacman::p_load(tidyverse)
# read data from url
dta <- read_csv("http://kbroman.org/datacarp/portal_data_joined.csv")
# print the number of rows
# print the number of cols
# print the col names
# print the data type of each col
# print some example entries of each col
glimpse(dta)
## Observations: 34,786
## Variables: 13
## $ record_id <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748,...
## $ month <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10...
## $ day <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, ...
## $ year <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 197...
## $ plot_id <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ species_id <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL...
## $ sex <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M",...
## $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA,...
## $ weight <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 2...
## $ genus <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotom...
## $ species <chr> "albigula", "albigula", "albigula", "albigula", "al...
## $ taxa <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "...
## $ plot_type <chr> "Control", "Control", "Control", "Control", "Contro...
# check dimensions of dta
dim(dta)
## [1] 34786 13
# print the first 6 lines of three varibales, plot_id, species_id, weight.
dplyr::select(dta, plot_id, species_id, weight) %>% head()
## # A tibble: 6 x 3
## plot_id species_id weight
## <dbl> <chr> <dbl>
## 1 2 NL NA
## 2 2 NL NA
## 3 2 NL NA
## 4 2 NL NA
## 5 2 NL NA
## 6 2 NL NA
# print the first 6 lines of dta besides two variables, record_id, species_id
dplyr::select(dta, -record_id, -species_id) %>% head()
## # A tibble: 6 x 11
## month day year plot_id sex hindfoot_length weight genus species taxa
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 7 16 1977 2 M 32 NA Neot~ albigu~ Rode~
## 2 8 19 1977 2 M 31 NA Neot~ albigu~ Rode~
## 3 9 13 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## 4 10 16 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## 5 11 12 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## 6 11 12 1977 2 <NA> NA NA Neot~ albigu~ Rode~
## # ... with 1 more variable: plot_type <chr>
# the first 6 lines of dta from year 1995
dplyr::filter(dta, year == 1995) %>% head()
## # A tibble: 6 x 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 22314 6 7 1995 2 NL M 34 NA
## 2 22728 9 23 1995 2 NL F 32 165
## 3 22899 10 28 1995 2 NL F 32 171
## 4 23032 12 2 1995 2 NL F 33 NA
## 5 22003 1 11 1995 2 DM M 37 41
## 6 22042 2 4 1995 2 DM F 36 45
## # ... with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
## # plot_type <chr>
# remove all the entries that have more than 5 in weight
# select 3 column to display, weight, species_id, sex
# print the first 6 lines
head(dplyr::select(dplyr::filter(dta, weight <= 5), species_id, sex, weight))
## # A tibble: 6 x 3
## species_id sex weight
## <chr> <chr> <dbl>
## 1 PF M 5
## 2 PF F 5
## 3 PF F 5
## 4 PF F 4
## 5 PF F 5
## 6 PF F 4
dta %>%
# remove the rows where weight is larger than 5
dplyr::filter(weight <= 5) %>%
# select 3 variables, species_id, sex and weight to display
dplyr::select(species_id, sex, weight) %>%
# print the first 6 lines
head
## # A tibble: 6 x 3
## species_id sex weight
## <chr> <chr> <dbl>
## 1 PF M 5
## 2 PF F 5
## 3 PF F 5
## 4 PF F 4
## 5 PF F 5
## 6 PF F 4
dta %>%
# create two new columns
# weight_kg = weight / 100
# weight_lb = weight_kg * 2.2
mutate(weight_kg = weight / 1000,
weight_lb = weight_kg * 2.2) %>%
# print the first 6 lines
head()
## # A tibble: 6 x 15
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## # ... with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
## # plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>
dta %>%
# remove all the entries that have na value in column weight
filter(!is.na(weight)) %>%
# group the data by sex and species_id
group_by(sex, species_id) %>%
# compute the mean of column weight within the group which was specified in the above line
summarize(mean_weight = mean(weight)) %>%
# sort the data by column mean_weight in an descending order
arrange(desc(mean_weight)) %>%
# print the first 6 lines
head()
## # A tibble: 6 x 3
## # Groups: sex [3]
## sex species_id mean_weight
## <chr> <chr> <dbl>
## 1 <NA> NL 168.
## 2 M NL 166.
## 3 F NL 154.
## 4 M SS 130
## 5 <NA> SH 130
## 6 M DS 122.
dta %>%
# group the data by the variable sex
group_by(sex) %>%
# compute the freq within the group
tally
## # A tibble: 3 x 2
## sex n
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
dta %>%
# compute the freq of the variable sex
count(sex)
## # A tibble: 3 x 2
## sex n
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
dta %>%
group_by(sex) %>%
summarize(count = n())
## # A tibble: 3 x 2
## sex count
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
dta %>%
group_by(sex) %>%
# compute the freq of sex using varible year
summarize(count = sum(!is.na(year)))
## # A tibble: 3 x 2
## sex count
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# !is.na(year) return a logical vector within which one FALSE represents a missing value
# when you compute a logical vector all the TRUEs would become 1 and all the FALSEs would become 0
# therefore, when you perform a summation for a logical vector, it will return an integer which means how many TRUEs in it
# in this case, it returns the number of valid entries.
dta_gw <- dta %>%
# remove all the rows with na value in column weight
filter(!is.na(weight)) %>%
# group the data by variables genus and plot_id
group_by(genus, plot_id) %>%
# compute the mean of column weight within the group
summarize(mean_weight = mean(weight))
# print the number of rows and cols
# print the data type of each col
# print some example entries of each col
glimpse(dta_gw)
## Observations: 196
## Variables: 3
## Groups: genus [10]
## $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", ...
## $ plot_id <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, ...
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.533...
dta_w <- dta_gw %>%
# transform the data into a wide form
# which means using the value of gunus as columns
# and put the value of mean_weight in the correspoding entries
spread(key = genus, value = mean_weight)
# rows drops from 196 to 24
# cols increases from 3 to 11
glimpse(dta_w)
## Observations: 24
## Variables: 11
## $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
## $ Baiomys <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA,...
## $ Chaetodipus <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 2...
## $ Dipodomys <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 5...
## $ Neotoma <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 1...
## $ Onychomys <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 2...
## $ Perognathus <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7...
## $ Peromyscus <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 2...
## $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.1545...
## $ Sigmodon <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.7777...
## $ Spermophilus <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
dta_gw %>%
# make a wide form
# replace missing values by 0
spread(genus, mean_weight, fill = 0) %>%
# print the first 6 lines
head()
## # A tibble: 6 x 11
## plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 7 22.2 60.2 156. 27.7 9.62 22.2
## 2 2 6 25.1 55.7 169. 26.9 6.95 22.3
## 3 3 8.61 24.6 52.0 158. 26.0 7.51 21.4
## 4 4 0 23.0 57.5 164. 28.1 7.82 22.6
## 5 5 7.75 18.0 51.1 190. 27.0 8.66 21.2
## 6 6 0 24.9 58.6 180. 25.9 7.81 21.8
## # ... with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
## # Spermophilus <dbl>
dta_l <- dta_w %>%
# make a long form
# stored all cols besides plot_id into a new col named genus
# stored all values besides those in col plot_id into a new col named mean_weight
gather(key = genus, value = mean_weight, -plot_id)
# rows increases from 24 to 240
# cols drops form 11 to 3
glimpse(dta_l)
## Observations: 240
## Variables: 3
## $ plot_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ...
## $ genus <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", ...
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA,...
I have no idea why the number of rows in the long form is 240. I expected that the number should have been equal to dta_gw that is 196.
dta_w %>%
# make a long form
# creat a new col named genus
# store col from Baiomys to Spermophilus into genus
# put the corresponding values into a new colunmn mean_weight
gather(key = genus, value = mean_weight, Baiomys:Spermophilus) %>%
head()
## # A tibble: 6 x 3
## plot_id genus mean_weight
## <dbl> <chr> <dbl>
## 1 1 Baiomys 7
## 2 2 Baiomys 6
## 3 3 Baiomys 8.61
## 4 4 Baiomys NA
## 5 5 Baiomys 7.75
## 6 6 Baiomys NA
dta_complete <- dta %>%
# remove all entries with na value in column weight
filter(!is.na(weight),
# remove all entries with na value in column hindfoot_length
!is.na(hindfoot_length),
# remove all entries with na value in column sex
!is.na(sex))
species_counts <- dta_complete %>%
# compute freq for each category in column species_id
count(species_id) %>%
# remove all the entries with value less than 50 in column n
filter(n >= 50)
dta_complete <- dta_complete %>%
# remove the entries in data object dta_complete where the column species_id has the same value in the column species_id in data object species_counts
filter(species_id %in% species_counts$species_id)
library(car)
data(Vocab)
dta_vocab <- Vocab
head(aggregate(data = dta_vocab, .~year + sex, mean))
## year sex education vocabulary
## 1 1974 Female 11.82429 6.077519
## 2 1976 Female 11.60710 6.140684
## 3 1978 Female 11.78978 6.019744
## 4 1982 Female 12.10911 5.765766
## 5 1984 Female 12.38200 6.070560
## 6 1987 Female 12.45627 5.752371
dta_vocab %>% group_by(year, sex) %>%
summarise(mean_edu = mean(education), mean_vocab = mean(vocabulary)) %>% head()
## # A tibble: 6 x 4
## # Groups: year [3]
## year sex mean_edu mean_vocab
## <dbl> <fct> <dbl> <dbl>
## 1 1974 Female 11.8 6.08
## 2 1974 Male 11.9 5.96
## 3 1976 Female 11.6 6.14
## 4 1976 Male 12.1 5.95
## 5 1978 Female 11.8 6.02
## 6 1978 Male 12.4 5.89
# read data
dta_animal <- MASS::Animals
dta_mammal <- MASS::mammals
# make row names a column
dta_animal$names <- row.names(dta_animal)
dta_mammal$names <- row.names(dta_mammal)
# combind two data objects by rows
dta <- rbind(dta_animal, dta_mammal)
# see how many duplicated observations in dta
sum(duplicated(dta$names))
## [1] 23
# remove duplicated observations
dta <- dta[-which(duplicated(dta$names)), ]
# check data structure, observations drops from 90 to 67
str(dta)
## 'data.frame': 67 obs. of 3 variables:
## $ body : num 1.35 465 36.33 27.66 1.04 ...
## $ brain: num 8.1 423 119.5 115 5.5 ...
## $ names: chr "Mountain beaver" "Cow" "Grey wolf" "Goat" ...
## read data
dta_probe <- read.table("probeL.txt", header = T)
dta_probe_w <- reshape(dta_probe,
idvar = "ID",
# the variabel we want to expand
timevar = "Position",
# the value comes from column Response_Time
v.names = "Response_Time",
# direction from long to wide
direction = "wide")
# rename cols
colnames(dta_probe_w)[-1] <- paste("Pos_", 1:5, sep = "")
# print the first 11 lines
head(dta_probe_w, 11)
## ID Pos_1 Pos_2 Pos_3 Pos_4 Pos_5
## 1 S01 51 36 50 35 42
## 6 S02 27 20 26 17 27
## 11 S03 37 22 41 37 30
## 16 S04 42 36 32 34 27
## 21 S05 27 18 33 14 29
## 26 S06 43 32 43 35 40
## 31 S07 41 22 36 25 38
## 36 S08 38 21 31 20 16
## 41 S09 36 23 27 25 28
## 46 S10 26 31 31 32 36
## 51 S11 29 20 25 26 25
# make it a wide form
dta_probe_w <- dta_probe %>% spread(key = Position, value = Response_Time)
# rename cols
colnames(dta_probe_w)[-1] <- paste("Pos_", 1:5, sep = "")
# print the first 11 lines
head(dta_probe_w, 11)
## ID Pos_1 Pos_2 Pos_3 Pos_4 Pos_5
## 1 S01 51 36 50 35 42
## 2 S02 27 20 26 17 27
## 3 S03 37 22 41 37 30
## 4 S04 42 36 32 34 27
## 5 S05 27 18 33 14 29
## 6 S06 43 32 43 35 40
## 7 S07 41 22 36 25 38
## 8 S08 38 21 31 20 16
## 9 S09 36 23 27 25 28
## 10 S10 26 31 31 32 36
## 11 S11 29 20 25 26 25