In [3]:
library(tidyverse, verbose = FALSE)
df = readxl::read_excel('datasets.xlsx', sheet = 'wrangling', range = 'p7:T11')
df
str(df)
| Education | Football | Hockey | Cricket | Baseball |
|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> |
| SSC | 34 | 25 | 20 | 7 |
| HSC | 4 | 18 | 15 | 9 |
| Bachelor | 10 | 19 | 26 | 21 |
| Masters | 5 | 16 | 23 | 28 |
tibble [4 × 5] (S3: tbl_df/tbl/data.frame) $ Education: chr [1:4] "SSC" "HSC" "Bachelor" "Masters" $ Football : num [1:4] 34 4 10 5 $ Hockey : num [1:4] 25 18 19 16 $ Cricket : num [1:4] 20 15 26 23 $ Baseball : num [1:4] 7 9 21 28
In [4]:
# We will subset this dataframe (tibble) in different ways
# Select only Football and Hockey columns
df %>% select(c(Football, Hockey))
| Football | Hockey |
|---|---|
| <dbl> | <dbl> |
| 34 | 25 |
| 4 | 18 |
| 10 | 19 |
| 5 | 16 |
In [5]:
# Filter only SSC row
df %>% filter(Education == 'SSC')
| Education | Football | Hockey | Cricket | Baseball |
|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> |
| SSC | 34 | 25 | 20 | 7 |
In [6]:
# Filter SSC and HSC rows
df %>% filter(Education == 'SSC' | Education == 'HSC')
| Education | Football | Hockey | Cricket | Baseball |
|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> |
| SSC | 34 | 25 | 20 | 7 |
| HSC | 4 | 18 | 15 | 9 |
In [7]:
# Alternate way of indexing using [row, column], 1:4 indicates 1 to 4
# Example, select first two columns and three rows
df[1:3, 1:2]
| Education | Football |
|---|---|
| <chr> | <dbl> |
| SSC | 34 |
| HSC | 4 |
| Bachelor | 10 |
In [8]:
# Select only Education column (first column), i.e. all acolumns
df[,1]
| Education |
|---|
| <chr> |
| SSC |
| HSC |
| Bachelor |
| Masters |
In [9]:
# Select all columns except first two columns
df[, -c(1, 2)]
| Hockey | Cricket | Baseball |
|---|---|---|
| <dbl> | <dbl> | <dbl> |
| 25 | 20 | 7 |
| 18 | 15 | 9 |
| 19 | 26 | 21 |
| 16 | 23 | 28 |
In [10]:
# Now we will convert the first column as row names to make it a contingency table
# First remove the first column
df[, -1]
| Football | Hockey | Cricket | Baseball |
|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <dbl> |
| 34 | 25 | 20 | 7 |
| 4 | 18 | 15 | 9 |
| 10 | 19 | 26 | 21 |
| 5 | 16 | 23 | 28 |
In [11]:
# Extract the values of the first column
df$Education
- 'SSC'
- 'HSC'
- 'Bachelor'
- 'Masters'
In [29]:
# Use this values as the row names
# Give a name (cont.table) to the contingency table
cont.table = df %>% remove_rownames %>% column_to_rownames(var = 'Education')
cont.table
| Football | Hockey | Cricket | Baseball | |
|---|---|---|---|---|
| <dbl> | <dbl> | <dbl> | <dbl> | |
| SSC | 34 | 25 | 20 | 7 |
| HSC | 4 | 18 | 15 | 9 |
| Bachelor | 10 | 19 | 26 | 21 |
| Masters | 5 | 16 | 23 | 28 |
In [30]:
# We can also bring these row names to column i.e. convert to a dataframe.
# See the changes in the dataframe from 4*4 to 4*5
dataframe = cont.table %>% rownames_to_column(var = 'Education')
dataframe
| Education | Football | Hockey | Cricket | Baseball |
|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> |
| SSC | 34 | 25 | 20 | 7 |
| HSC | 4 | 18 | 15 | 9 |
| Bachelor | 10 | 19 | 26 | 21 |
| Masters | 5 | 16 | 23 | 28 |