In [3]:
library(tidyverse, verbose = FALSE)
df = readxl::read_excel('datasets.xlsx', sheet = 'wrangling', range = 'p7:T11')
df
str(df)
A tibble: 4 × 5
EducationFootballHockeyCricketBaseball
<chr><dbl><dbl><dbl><dbl>
SSC 342520 7
HSC 41815 9
Bachelor10192621
Masters 5162328
tibble [4 × 5] (S3: tbl_df/tbl/data.frame)
 $ Education: chr [1:4] "SSC" "HSC" "Bachelor" "Masters"
 $ Football : num [1:4] 34 4 10 5
 $ Hockey   : num [1:4] 25 18 19 16
 $ Cricket  : num [1:4] 20 15 26 23
 $ Baseball : num [1:4] 7 9 21 28
In [4]:
# We will subset this dataframe (tibble) in different ways
# Select only Football and Hockey columns
df %>% select(c(Football, Hockey))
A tibble: 4 × 2
FootballHockey
<dbl><dbl>
3425
418
1019
516
In [5]:
# Filter only SSC row
df %>% filter(Education == 'SSC')
A tibble: 1 × 5
EducationFootballHockeyCricketBaseball
<chr><dbl><dbl><dbl><dbl>
SSC3425207
In [6]:
# Filter SSC and HSC rows
df %>% filter(Education == 'SSC' | Education == 'HSC')
A tibble: 2 × 5
EducationFootballHockeyCricketBaseball
<chr><dbl><dbl><dbl><dbl>
SSC3425207
HSC 418159
In [7]:
# Alternate way of indexing using [row, column], 1:4 indicates 1 to 4
# Example, select first two columns and three rows
df[1:3, 1:2]
A tibble: 3 × 2
EducationFootball
<chr><dbl>
SSC 34
HSC 4
Bachelor10
In [8]:
# Select only Education column (first column), i.e. all acolumns
df[,1]
A tibble: 4 × 1
Education
<chr>
SSC
HSC
Bachelor
Masters
In [9]:
# Select all columns except first two columns
df[, -c(1, 2)]
A tibble: 4 × 3
HockeyCricketBaseball
<dbl><dbl><dbl>
2520 7
1815 9
192621
162328
In [10]:
# Now we will convert the first column as row names to make it a contingency table
# First remove the first column
df[, -1]
A tibble: 4 × 4
FootballHockeyCricketBaseball
<dbl><dbl><dbl><dbl>
342520 7
41815 9
10192621
5162328
In [11]:
# Extract the values of the first column
df$Education
  1. 'SSC'
  2. 'HSC'
  3. 'Bachelor'
  4. 'Masters'
In [29]:
# Use this values as the row names
# Give a name (cont.table) to the contingency table
cont.table = df %>% remove_rownames %>% column_to_rownames(var = 'Education')
cont.table
A data.frame: 4 × 4
FootballHockeyCricketBaseball
<dbl><dbl><dbl><dbl>
SSC342520 7
HSC 41815 9
Bachelor10192621
Masters 5162328
In [30]:
# We can also bring these row names to column i.e. convert to a dataframe.
# See the changes in the dataframe from 4*4 to 4*5
dataframe = cont.table %>% rownames_to_column(var = 'Education')
dataframe
A data.frame: 4 × 5
EducationFootballHockeyCricketBaseball
<chr><dbl><dbl><dbl><dbl>
SSC 342520 7
HSC 41815 9
Bachelor10192621
Masters 5162328