# Lesson 4: Data Manipulation, Pareto Chart and Chi-square Test
# We will need two libraries: corrplot, tidyverse
library(corrplot)
## corrplot 0.95 loaded
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data = readxl::read_excel('DataSets.xlsx', sheet = 'wrangling', range = 'P7:T11')
# see the data as this is a small dataframe
data
## # A tibble: 4 × 5
## Education Football Hockey Cricket Baseball
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 SSC 34 25 20 7
## 2 HSC 4 18 15 9
## 3 Bachelor 10 19 26 21
## 4 Masters 5 16 23 28
# Now we will add a new variable named 'Total' and rename the data as data1
# We will use data as the input for the mutate() function connected by %>%
data1 = data %>% mutate(Total = Football + Hockey + Cricket)
data1
## # A tibble: 4 × 6
## Education Football Hockey Cricket Baseball Total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSC 34 25 20 7 79
## 2 HSC 4 18 15 9 37
## 3 Bachelor 10 19 26 21 55
## 4 Masters 5 16 23 28 44
# We will create another variable named 'cumFreq'
# To show the result while saving to an object use () around the codes
(data2 = data1 %>% mutate(cumFreq = cumsum(Total)))
## # A tibble: 4 × 7
## Education Football Hockey Cricket Baseball Total cumFreq
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSC 34 25 20 7 79 79
## 2 HSC 4 18 15 9 37 116
## 3 Bachelor 10 19 26 21 55 171
## 4 Masters 5 16 23 28 44 215
# Pareto chart: Plot this Total and cumFreq in the same graph where Total will be shown as bars and cumFreq as a line (curve).
# Change order of the Education items
data2$Education = factor(data$Education, levels = c('SSC', 'HSC', 'Bachelor', 'Masters'))
ggplot(data2)+
aes(x = Education, y = Total, fill = Education, color = Education)+
geom_col()+
geom_line(aes(y = cumFreq, group = 1), size = 2)+
scale_y_continuous(name = 'Frequency',
sec.axis = sec_axis(transform = ~., name = 'Cumulative frequency'))+
geom_text(aes(y = Total+10, label = Total), color = 'blue')+
geom_text(aes(y = cumFreq + 10, label = cumFreq), color = 'forestgreen')+
theme_bw()+
theme(legend.position = 'none')
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Can you change this values to percent?
# Calculate within the dataset, name the new dataset as data3 and use this in the ggplot(data3)
# You can over write the previous variable names or create new variables
(data3 = data2 %>% mutate(Total = Total*100/sum(Total),
cumFreq = cumsum(Total)))
## # A tibble: 4 × 7
## Education Football Hockey Cricket Baseball Total cumFreq
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SSC 34 25 20 7 36.7 36.7
## 2 HSC 4 18 15 9 17.2 54.0
## 3 Bachelor 10 19 26 21 25.6 79.5
## 4 Masters 5 16 23 28 20.5 100
data3 = data3 %>% mutate(Total = round(Total, 2),
cumFreq = round(cumFreq, 2))
ggplot(data3)+
aes(x = Education, y = Total, fill = Education, color = Education)+
geom_col()+
geom_line(aes(y = cumFreq, group = 1), size = 2)+
scale_y_continuous(name = 'Frequency',
sec.axis = sec_axis(transform = ~., name = 'Cumulative frequency'))+
geom_text(aes(y = Total + 10, label = Total), color = 'blue')+
geom_text(aes(y = cumFreq + 10, label = cumFreq), color = 'forestgreen')+
theme_bw()+
labs(title = 'Pareto chart')+
theme(legend.position = 'none')

# Now go back to the original dataset
data
## # A tibble: 4 × 5
## Education Football Hockey Cricket Baseball
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 SSC 34 25 20 7
## 2 HSC 4 18 15 9
## 3 Bachelor 10 19 26 21
## 4 Masters 5 16 23 28
# We want to transform this dataset in such a way that this will have two variables: Education and Sport. The Sport variable will stack all the values in three variables, namely Football, Hockey, Cricket and Baseball
(longTable = pivot_longer(data, cols = Football:Baseball, names_to = 'Sport', values_to = 'Frequency'))
## # A tibble: 16 × 3
## Education Sport Frequency
## <chr> <chr> <dbl>
## 1 SSC Football 34
## 2 SSC Hockey 25
## 3 SSC Cricket 20
## 4 SSC Baseball 7
## 5 HSC Football 4
## 6 HSC Hockey 18
## 7 HSC Cricket 15
## 8 HSC Baseball 9
## 9 Bachelor Football 10
## 10 Bachelor Hockey 19
## 11 Bachelor Cricket 26
## 12 Bachelor Baseball 21
## 13 Masters Football 5
## 14 Masters Hockey 16
## 15 Masters Cricket 23
## 16 Masters Baseball 28
# Now we will again create a contingency table using the new dataset
# We will use 'shortTable' as the name of the new dataset
(shortTable = pivot_wider(longTable, names_from = Sport, values_from = Frequency))
## # A tibble: 4 × 5
## Education Football Hockey Cricket Baseball
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 SSC 34 25 20 7
## 2 HSC 4 18 15 9
## 3 Bachelor 10 19 26 21
## 4 Masters 5 16 23 28
# Now we will perform a chi-square test
# We will use the 'chisq.test()' function
# We will use the 'contTable' dataset as the input
# Contingency table is a table that shows the frequency distribution of two variables
# Here, education is a variable and sports is another variable
# We need to put these education types as row names (not as a variable)
(contTable = shortTable %>% column_to_rownames(var = 'Education'))
## Football Hockey Cricket Baseball
## SSC 34 25 20 7
## HSC 4 18 15 9
## Bachelor 10 19 26 21
## Masters 5 16 23 28
# Now we will perform the chi-square test and save the result in an object named 'chiSq'
(chiSq = chisq.test(contTable))
##
## Pearson's Chi-squared test
##
## data: contTable
## X-squared = 50.714, df = 9, p-value = 7.906e-08
# p < 0.05, so we reject the null hypothesis (H0: There is no association between Education and Sports)
# Therefore, we conclude that there is an association between Education and Sports. This means that preference to sports varies with education levels.
# We can see the expected, observed and residual values
chiSq$expected
## Football Hockey Cricket Baseball
## SSC 16.278571 23.95714 25.8 19.96429
## HSC 8.707143 12.81429 13.8 10.67857
## Bachelor 14.385714 21.17143 22.8 17.64286
## Masters 13.628571 20.05714 21.6 16.71429
chiSq$observed
## Football Hockey Cricket Baseball
## SSC 34 25 20 7
## HSC 4 18 15 9
## Bachelor 10 19 26 21
## Masters 5 16 23 28
chiSq$residual
## Football Hockey Cricket Baseball
## SSC 4.392286 0.2130626 -1.1418739 -2.9014942
## HSC -1.595216 1.4486430 0.3230291 -0.5136689
## Bachelor -1.156311 -0.4719223 0.6701663 0.7992550
## Masters -2.337296 -0.9059115 0.3012320 2.7604834
# Residuals = (observed - expected)/sqrt(expected)
# Now, we will visualize the contribution of each cells to the chi-square value using corrplot()
corrplot(chiSq$residual, is.corr = FALSE, tl.col = 'blue', cl.pos = 'b', title = 'Size of the residuals', mar = c(0, 0, 2, 0))

# We can see that the contribution of the cells with bigger circles (blue gradient) is higher than the cells with smaller circles (red gradient) to the chi-square value.
# We can conclude that students at SSC level have a higher preference for Football than expected, while students at SSC level have a lower preference for Baseball than expected.
# We can calculate the contribution of each cell to the chi-square value
# contribution = (residual^2)*100/statistic
# We will use the 'residual' and 'expected' values from the 'chiSq' object
# You can also visualize this using corrplot()
(contribution = (chiSq$residual^2)*100/chiSq$statistic)
## Football Hockey Cricket Baseball
## SSC 38.041170 0.08951325 2.5710409 16.6003067
## HSC 5.017782 4.13804747 0.2057577 0.5202825
## Bachelor 2.636465 0.43915075 0.8856004 1.2596313
## Masters 10.772095 1.61824485 0.1789266 15.0259861
corrplot(contribution, is.corr = FALSE, tl.col = 'blue', cl.pos = 'b', title = 'Contribution', mar = c(0, 0, 2, 0))

# Let us transform another dataset with the same information
data = readxl::read_excel('DataSets.xlsx', sheet = 'chi_square', range = 'G18:I298')
head(data)
## # A tibble: 6 × 3
## sr. education sport
## <dbl> <chr> <chr>
## 1 1 SSC Football
## 2 2 SSC Football
## 3 3 SSC Football
## 4 4 SSC Football
## 5 5 SSC Football
## 6 6 SSC Football
tail(data)
## # A tibble: 6 × 3
## sr. education sport
## <dbl> <chr> <chr>
## 1 275 Masters Baseball
## 2 276 Masters Baseball
## 3 277 Masters Baseball
## 4 278 Masters Baseball
## 5 279 Masters Baseball
## 6 280 Masters Baseball
# We will transform this dataset to a contingency table
(conTable = table(data$education, data$sport))
##
## Baseball Cricket Football Hockey
## Bachelor 21 26 10 19
## HSC 9 15 4 18
## Masters 28 23 5 16
## SSC 7 20 34 25
class(conTable)
## [1] "table"
# Convert to a dataframe
conData = as.data.frame(conTable)
class(conData)
## [1] "data.frame"
# Create sr., rename the columns
colnames(conData) = c('education', 'sport', 'frequency')
rownames(conData) = 1:nrow(conData)
conData
## education sport frequency
## 1 Bachelor Baseball 21
## 2 HSC Baseball 9
## 3 Masters Baseball 28
## 4 SSC Baseball 7
## 5 Bachelor Cricket 26
## 6 HSC Cricket 15
## 7 Masters Cricket 23
## 8 SSC Cricket 20
## 9 Bachelor Football 10
## 10 HSC Football 4
## 11 Masters Football 5
## 12 SSC Football 34
## 13 Bachelor Hockey 19
## 14 HSC Hockey 18
## 15 Masters Hockey 16
## 16 SSC Hockey 25
conData %>% rownames_to_column(var = 'sr.')
## sr. education sport frequency
## 1 1 Bachelor Baseball 21
## 2 2 HSC Baseball 9
## 3 3 Masters Baseball 28
## 4 4 SSC Baseball 7
## 5 5 Bachelor Cricket 26
## 6 6 HSC Cricket 15
## 7 7 Masters Cricket 23
## 8 8 SSC Cricket 20
## 9 9 Bachelor Football 10
## 10 10 HSC Football 4
## 11 11 Masters Football 5
## 12 12 SSC Football 34
## 13 13 Bachelor Hockey 19
## 14 14 HSC Hockey 18
## 15 15 Masters Hockey 16
## 16 16 SSC Hockey 25