library(corrplot)
## corrplot 0.95 loaded
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
# Import the survey responses from the "survey" sheet of the workbook,
# restricted to cells B50:AJ431 (the header row plus the data records).
df <- read_excel(
  path = "DataSets.xlsx",
  sheet = "survey",
  range = "B50:AJ431"
)
# Preview the first rows to confirm the import looks right.
head(df)
## # A tibble: 6 × 35
## case date location age education family hroof landcultivated credit
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 12,09,2018 Kalapara 41 0 5 Tin 0.401 No
## 2 2 12,09,2018 Kalapara 50 10 5 Tin 0.405 No
## 3 3 12,09,2018 Kalapara 38 6 5 Tin 0.972 No
## 4 4 12,09,2018 Kalapara 40 1 6 Tin 2.15 No
## 5 5 12,09,2018 Kalapara 55 4 5 Tin 2.15 No
## 6 6 12,09,2018 Kalapara 55 8 3 Tin 0.405 Yes
## # ℹ 26 more variables: ccheard <chr>, temp <chr>, rain <chr>, cyclone <chr>,
## # ricecultivation <chr>, nonricecultivation <chr>, vegecultivation <chr>,
## # fruitcultivation <chr>, liverearing <chr>, fishculture <chr>,
## # treeplantation <chr>, ricechange <chr>, nonricechange <chr>,
## # vegechange <chr>, fruitchange <chr>, livechange <chr>, fishchange <chr>,
## # treechange <chr>, ccfarmingsystem <chr>, ricefor <chr>, nonricefor <chr>,
## # vegefor <chr>, fruitfor <chr>, livefor <chr>, fishfor <chr>, …
# Inspect the last six rows too, to check the end of the imported range.
tail(df, n = 6L)
## # A tibble: 6 × 35
## case date location age education family hroof landcultivated credit
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 376 11,10,2018 Teknaf 50 0 8 Leaves… 0.81 No
## 2 377 11,10,2018 Teknaf 35 0 5 Tin 0.162 No
## 3 378 11,10,2018 Teknaf 40 0 6 Leaves… 0.648 No
## 4 379 11,10,2018 Teknaf 38 5 6 Tin 0.04 No
## 5 380 11,10,2018 Teknaf 32 11 14 Tin 0.81 No
## 6 381 11,10,2018 Teknaf 45 0 9 Leaves… 0.105 No
## # ℹ 26 more variables: ccheard <chr>, temp <chr>, rain <chr>, cyclone <chr>,
## # ricecultivation <chr>, nonricecultivation <chr>, vegecultivation <chr>,
## # fruitcultivation <chr>, liverearing <chr>, fishculture <chr>,
## # treeplantation <chr>, ricechange <chr>, nonricechange <chr>,
## # vegechange <chr>, fruitchange <chr>, livechange <chr>, fishchange <chr>,
## # treechange <chr>, ccfarmingsystem <chr>, ricefor <chr>, nonricefor <chr>,
## # vegefor <chr>, fruitfor <chr>, livefor <chr>, fishfor <chr>, …
# List the column (variable) names available in the survey data
colnames(df)
## [1] "case" "date" "location"
## [4] "age" "education" "family"
## [7] "hroof" "landcultivated" "credit"
## [10] "ccheard" "temp" "rain"
## [13] "cyclone" "ricecultivation" "nonricecultivation"
## [16] "vegecultivation" "fruitcultivation" "liverearing"
## [19] "fishculture" "treeplantation" "ricechange"
## [22] "nonricechange" "vegechange" "fruitchange"
## [25] "livechange" "fishchange" "treechange"
## [28] "ccfarmingsystem" "ricefor" "nonricefor"
## [31] "vegefor" "fruitfor" "livefor"
## [34] "fishfor" "treefor"
# Descriptive statistics: quartiles, range and mean of respondent age
summary(df[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 36.00 45.00 46.76 57.00 90.00
# Sample standard deviation of age
sd(df$age)
## [1] 13.15927
# Standard error of the mean = SD / sqrt(n); the outer parentheses make the
# assignment print its value.
(se <- sd(df$age) / sqrt(nrow(df)))
## [1] 0.67417
# 95% confidence interval: mean -/+ 1.96 * SE
# (for a 99% interval the multiplier would be 2.58 instead of 1.96)
lower <- mean(df$age) - 1.96 * se
upper <- mean(df$age) + 1.96 * se
paste(
  "The 95% confidence interval for the mean age is between",
  round(lower, 2),
  "and",
  round(upper, 2)
)
## [1] "The 95% confidence interval for the mean age is between 45.44 and 48.08"
# The age of the respondents ranges from 24 to 90 years with a mean of 46.76 years and standard deviation of 13.16 years. The standard error of the mean is 0.67 years. The 95% confidence interval for the mean age is between 45.44 and 48.08 years.
# Coefficient of variation: the standard deviation expressed as a
# percentage of the mean
cv <- sd(df$age) / mean(df$age) * 100
cv
## [1] 28.14146
# The coefficient of variation for age is 28.1% which indicates that the age of the respondents is relatively dispersed around the mean.
# Q. We want to know whether this age distribution is associated with their perception of temperature change over the last 5 years (survey year was 2018) compared to the previous 10 years.
# Check how the two variables are stored before choosing a test
str(df[["age"]])
## num [1:381] 41 50 38 40 55 55 40 42 62 65 ...
str(df[["temp"]])
## chr [1:381] "Increased" "Increased" "Increased" "Increased" "Increased" ...
# We will use chi-square test
# This test requires both variables to be categorical
# Chi-square test is used to understand the association between two categorical variables.
# The chi-square test needs categorical inputs, so bin age into three groups:
# 40 and under, 41-50, and over 50 (the oldest respondent is 90).
# With right = TRUE every interval is closed on the right, i.e.
# (-Inf, 40], (40, 50], (50, Inf), so a 40-year-old falls in the first group;
# the first label is therefore "<=40" rather than "<40".
df$age_group <- cut(
  df$age,
  breaks = c(-Inf, 40, 50, Inf),
  labels = c("<=40", "41-50", ">50"),
  right = TRUE
)
# Number of respondents in each age group
table(df$age_group)
##
## <=40 41-50 >50
## 155 92 134
# Create a contingency table using age_group (rows) and temp (columns)
contTable <- table(df$age_group, df$temp)
contTable
##
## Decreased Increased Not sure/unchanged
## <=40 2 148 5
## 41-50 1 90 1
## >50 5 126 3
# Pearson's chi-square test of independence on the contingency table
result <- chisq.test(contTable)
## Warning in chisq.test(contTable): Chi-squared approximation may be incorrect
# Assumption: no more than 20% of the EXPECTED cell counts should be below 5
# (and none below 1). The two sparse columns of this table push well over
# 20% of the expected counts below 5, which is why R issues the warning.
result
##
## Pearson's Chi-squared test
##
## data: contTable
## X-squared = 3.8457, df = 4, p-value = 0.4273
# p-value > 0.05: there is insufficient evidence to reject the H0
# H0: Age and temperature perception are independent
# H0: There is no association between age and perception of temperature
# We cannot reject the H0
# We found no evidence that the age of the respondents is associated with their perception of temperature change over the last 5 years compared to the previous 10 years (failing to reject H0 does not prove that there is no association).
# The chi-square approximation was not reliable here (see the warning above), so we use Fisher's exact test, which does not rely on large expected counts
# Fisher's exact test on the same contingency table
fisher <- fisher.test(contTable)
fisher
##
## Fisher's Exact Test for Count Data
##
## data: contTable
## p-value = 0.528
## alternative hypothesis: two.sided
# Similar conclusion as chi-square test
# For visualization of the residuals, please see the previous lessons.