# Lesson 4: Data Manipulation, Pareto Chart and Chi-square Test

# We will need two libraries: corrplot and tidyverse

library(corrplot)
## corrplot 0.95 loaded
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data = readxl::read_excel('DataSets.xlsx', sheet = 'wrangling', range = 'P7:T11')

# View the data, since this is a small data frame
data
## # A tibble: 4 × 5
##   Education Football Hockey Cricket Baseball
##   <chr>        <dbl>  <dbl>   <dbl>    <dbl>
## 1 SSC             34     25      20        7
## 2 HSC              4     18      15        9
## 3 Bachelor        10     19      26       21
## 4 Masters          5     16      23       28
# Now we will add a new variable named 'Total' and save the result as data1
# We pipe data into the mutate() function using %>% 
data1 = data %>% mutate(Total = Football + Hockey + Cricket)
data1
## # A tibble: 4 × 6
##   Education Football Hockey Cricket Baseball Total
##   <chr>        <dbl>  <dbl>   <dbl>    <dbl> <dbl>
## 1 SSC             34     25      20        7    79
## 2 HSC              4     18      15        9    37
## 3 Bachelor        10     19      26       21    55
## 4 Masters          5     16      23       28    44
# We will create another variable named 'cumFreq'
# To show the result while saving it to an object, wrap the code in ()
(data2 = data1 %>% mutate(cumFreq = cumsum(Total)))
## # A tibble: 4 × 7
##   Education Football Hockey Cricket Baseball Total cumFreq
##   <chr>        <dbl>  <dbl>   <dbl>    <dbl> <dbl>   <dbl>
## 1 SSC             34     25      20        7    79      79
## 2 HSC              4     18      15        9    37     116
## 3 Bachelor        10     19      26       21    55     171
## 4 Masters          5     16      23       28    44     215
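# (Optional) The same cumulative frequency could be added in base R without the pipe;
# a minimal sketch, with 'data2_base' used here only as an illustrative name:
data2_base = data1
data2_base$cumFreq = cumsum(data2_base$Total)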
# Pareto chart: plot Total and cumFreq in the same graph, with Total shown as bars and cumFreq as a line (curve).

# Change the order of the Education levels
data2$Education = factor(data2$Education, levels = c('SSC', 'HSC', 'Bachelor', 'Masters'))

ggplot(data2)+
  aes(x = Education, y = Total, fill = Education, color = Education)+
  geom_col()+
  geom_line(aes(y = cumFreq, group = 1), linewidth = 2)+
  scale_y_continuous(name = 'Frequency', 
                     sec.axis = sec_axis(transform = ~., name = 'Cumulative frequency'))+
  geom_text(aes(y = Total+10, label = Total), color = 'blue')+
  geom_text(aes(y = cumFreq + 10, label = cumFreq), color = 'forestgreen')+
  theme_bw()+
  theme(legend.position = 'none')
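# Note: a classic Pareto chart sorts the bars in decreasing order of frequency before
# taking the cumulative sum. A minimal sketch of that variant ('paretoData' is just an
# illustrative name; reuse the ggplot code above with ggplot(paretoData)):
paretoData = data1 %>%
  arrange(desc(Total)) %>%
  mutate(Education = factor(Education, levels = Education),
         cumFreq = cumsum(Total))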

# Can you change these values to percentages?
# Calculate within the dataset, name the new dataset data3 and use it in ggplot(data3)
# You can overwrite the previous variables or create new ones

(data3 = data2 %>% mutate(Total = Total*100/sum(Total),
                          cumFreq = cumsum(Total)))
## # A tibble: 4 × 7
##   Education Football Hockey Cricket Baseball Total cumFreq
##   <fct>        <dbl>  <dbl>   <dbl>    <dbl> <dbl>   <dbl>
## 1 SSC             34     25      20        7  36.7    36.7
## 2 HSC              4     18      15        9  17.2    54.0
## 3 Bachelor        10     19      26       21  25.6    79.5
## 4 Masters          5     16      23       28  20.5   100
data3 = data3 %>% mutate(Total = round(Total, 2),
                         cumFreq = round(cumFreq, 2))

ggplot(data3)+
  aes(x = Education, y = Total, fill = Education, color = Education)+
  geom_col()+
  geom_line(aes(y = cumFreq, group = 1), linewidth = 2)+
  scale_y_continuous(name = 'Percentage (%)',
                     sec.axis = sec_axis(transform = ~., name = 'Cumulative percentage (%)'))+
  geom_text(aes(y = Total + 10, label = Total), color = 'blue')+
  geom_text(aes(y = cumFreq + 10, label = cumFreq), color = 'forestgreen')+
  theme_bw()+
  labs(title = 'Pareto chart')+
  theme(legend.position = 'none')

# Now go back to the original dataset
data
## # A tibble: 4 × 5
##   Education Football Hockey Cricket Baseball
##   <chr>        <dbl>  <dbl>   <dbl>    <dbl>
## 1 SSC             34     25      20        7
## 2 HSC              4     18      15        9
## 3 Bachelor        10     19      26       21
## 4 Masters          5     16      23       28
# We want to reshape this dataset to the long format with three variables: Education, Sport and Frequency. The Sport variable will stack the names of the four sport columns (Football, Hockey, Cricket and Baseball) and the Frequency variable will hold their values

(longTable = pivot_longer(data, cols = Football:Baseball, names_to = 'Sport', values_to = 'Frequency'))
## # A tibble: 16 × 3
##    Education Sport    Frequency
##    <chr>     <chr>        <dbl>
##  1 SSC       Football        34
##  2 SSC       Hockey          25
##  3 SSC       Cricket         20
##  4 SSC       Baseball         7
##  5 HSC       Football         4
##  6 HSC       Hockey          18
##  7 HSC       Cricket         15
##  8 HSC       Baseball         9
##  9 Bachelor  Football        10
## 10 Bachelor  Hockey          19
## 11 Bachelor  Cricket         26
## 12 Bachelor  Baseball        21
## 13 Masters   Football         5
## 14 Masters   Hockey          16
## 15 Masters   Cricket         23
## 16 Masters   Baseball        28
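# One benefit of the long format is that it feeds directly into ggplot; as a quick
# illustration (not part of the original exercise), a grouped bar chart:
ggplot(longTable)+
  aes(x = Education, y = Frequency, fill = Sport)+
  geom_col(position = 'dodge')+
  theme_bw()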
# Now we will convert the long dataset back to the wide (contingency-table-like) format
# We will use 'shortTable' as the name of the new dataset

(shortTable = pivot_wider(longTable, names_from = Sport, values_from = Frequency))
## # A tibble: 4 × 5
##   Education Football Hockey Cricket Baseball
##   <chr>        <dbl>  <dbl>   <dbl>    <dbl>
## 1 SSC             34     25      20        7
## 2 HSC              4     18      15        9
## 3 Bachelor        10     19      26       21
## 4 Masters          5     16      23       28
# Now we will perform a chi-square test using the 'chisq.test()' function
# We will use a contingency table named 'contTable' as the input
# A contingency table shows the joint frequency distribution of two variables
# Here, Education is one variable and Sport is the other
# We need to move the education types into the row names (not keep them as a variable)

(contTable = shortTable %>% column_to_rownames(var = 'Education'))
##          Football Hockey Cricket Baseball
## SSC            34     25      20        7
## HSC             4     18      15        9
## Bachelor       10     19      26       21
## Masters         5     16      23       28
# Now we will perform the chi-square test and save the result in an object named 'chiSq'
(chiSq = chisq.test(contTable))
## 
##  Pearson's Chi-squared test
## 
## data:  contTable
## X-squared = 50.714, df = 9, p-value = 7.906e-08
# p < 0.05, so we reject the null hypothesis (H0: There is no association between Education and Sports)
# Therefore, we conclude that there is an association between Education and Sports. This means that sport preference varies across education levels.
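# The individual components of the test can also be extracted from the 'chiSq' object;
# for example (a quick sketch):
chiSq$statistic  # the X-squared value
chiSq$p.value    # the p-value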

# We can see the expected, observed and residual values
chiSq$expected
##           Football   Hockey Cricket Baseball
## SSC      16.278571 23.95714    25.8 19.96429
## HSC       8.707143 12.81429    13.8 10.67857
## Bachelor 14.385714 21.17143    22.8 17.64286
## Masters  13.628571 20.05714    21.6 16.71429
chiSq$observed
##          Football Hockey Cricket Baseball
## SSC            34     25      20        7
## HSC             4     18      15        9
## Bachelor       10     19      26       21
## Masters         5     16      23       28
chiSq$residuals
##           Football     Hockey    Cricket   Baseball
## SSC       4.392286  0.2130626 -1.1418739 -2.9014942
## HSC      -1.595216  1.4486430  0.3230291 -0.5136689
## Bachelor -1.156311 -0.4719223  0.6701663  0.7992550
## Masters  -2.337296 -0.9059115  0.3012320  2.7604834
# Residuals = (observed - expected)/sqrt(expected)
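# A quick check of this formula against the stored residuals (a sketch):
(chiSq$observed - chiSq$expected)/sqrt(chiSq$expected)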

# Now we will visualize the contribution of each cell to the chi-square value using corrplot()
corrplot(chiSq$residuals, is.corr = FALSE, tl.col = 'blue', cl.pos = 'b', title = 'Size of the residuals', mar = c(0, 0, 2, 0))

# Cells with bigger circles contribute more to the chi-square value than cells with smaller circles; blue shades indicate positive residuals (observed > expected) and red shades indicate negative residuals (observed < expected).
# For example, students at the SSC level show a higher preference for Football and a lower preference for Baseball than expected.

# We can calculate the contribution of each cell to the chi-square value
# contribution = (residual^2)*100/statistic
# We will use the 'residuals' and 'statistic' components of the 'chiSq' object
# You can also visualize this using corrplot()

(contribution = (chiSq$residuals^2)*100/chiSq$statistic)
##           Football     Hockey   Cricket   Baseball
## SSC      38.041170 0.08951325 2.5710409 16.6003067
## HSC       5.017782 4.13804747 0.2057577  0.5202825
## Bachelor  2.636465 0.43915075 0.8856004  1.2596313
## Masters  10.772095 1.61824485 0.1789266 15.0259861
corrplot(contribution, is.corr = FALSE, tl.col = 'blue', cl.pos = 'b', title = 'Contribution', mar = c(0, 0, 2, 0))
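# Sanity check (a sketch): the cell contributions are percentages of the chi-square
# statistic, so they should sum to 100
sum(contribution)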

# Now let us work with another dataset that contains the same information, but in raw form (one row per observation)
data = readxl::read_excel('DataSets.xlsx', sheet = 'chi_square', range = 'G18:I298')

head(data)
## # A tibble: 6 × 3
##     sr. education sport   
##   <dbl> <chr>     <chr>   
## 1     1 SSC       Football
## 2     2 SSC       Football
## 3     3 SSC       Football
## 4     4 SSC       Football
## 5     5 SSC       Football
## 6     6 SSC       Football
tail(data)
## # A tibble: 6 × 3
##     sr. education sport   
##   <dbl> <chr>     <chr>   
## 1   275 Masters   Baseball
## 2   276 Masters   Baseball
## 3   277 Masters   Baseball
## 4   278 Masters   Baseball
## 5   279 Masters   Baseball
## 6   280 Masters   Baseball
# We will transform this dataset to a contingency table
(conTable = table(data$education, data$sport))
##           
##            Baseball Cricket Football Hockey
##   Bachelor       21      26       10     19
##   HSC             9      15        4     18
##   Masters        28      23        5     16
##   SSC             7      20       34     25
class(conTable)
## [1] "table"
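# The chi-square test can also be run directly on this table; a quick sketch that
# should reproduce the earlier result (the row/column order differs, but the
# statistic is the same):
chisq.test(conTable)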
# Convert to a dataframe
conData = as.data.frame(conTable)

class(conData)
## [1] "data.frame"
# Rename the columns, reset the row names, and then create an 'sr.' column from them
colnames(conData) = c('education', 'sport', 'frequency')

rownames(conData) = 1:nrow(conData)
conData
##    education    sport frequency
## 1   Bachelor Baseball        21
## 2        HSC Baseball         9
## 3    Masters Baseball        28
## 4        SSC Baseball         7
## 5   Bachelor  Cricket        26
## 6        HSC  Cricket        15
## 7    Masters  Cricket        23
## 8        SSC  Cricket        20
## 9   Bachelor Football        10
## 10       HSC Football         4
## 11   Masters Football         5
## 12       SSC Football        34
## 13  Bachelor   Hockey        19
## 14       HSC   Hockey        18
## 15   Masters   Hockey        16
## 16       SSC   Hockey        25
conData %>% rownames_to_column(var = 'sr.')
##    sr. education    sport frequency
## 1    1  Bachelor Baseball        21
## 2    2       HSC Baseball         9
## 3    3   Masters Baseball        28
## 4    4       SSC Baseball         7
## 5    5  Bachelor  Cricket        26
## 6    6       HSC  Cricket        15
## 7    7   Masters  Cricket        23
## 8    8       SSC  Cricket        20
## 9    9  Bachelor Football        10
## 10  10       HSC Football         4
## 11  11   Masters Football         5
## 12  12       SSC Football        34
## 13  13  Bachelor   Hockey        19
## 14  14       HSC   Hockey        18
## 15  15   Masters   Hockey        16
## 16  16       SSC   Hockey        25
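# The same long-format frequency table could also be produced directly from the raw
# data with dplyr::count(); a minimal sketch ('frequency' is just the column name
# used above):
data %>% count(education, sport, name = 'frequency')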