library(corrplot)
## corrplot 0.95 loaded
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)

# Read the survey responses from the "survey" sheet (cells B50:AJ431)
df <- read_excel(
  path = "DataSets.xlsx",
  sheet = "survey",
  range = "B50:AJ431"
)

# Preview the first six rows
head(df, n = 6L)
## # A tibble: 6 × 35
##    case date       location   age education family hroof landcultivated credit
##   <dbl> <chr>      <chr>    <dbl>     <dbl>  <dbl> <chr>          <dbl> <chr> 
## 1     1 12,09,2018 Kalapara    41         0      5 Tin            0.401 No    
## 2     2 12,09,2018 Kalapara    50        10      5 Tin            0.405 No    
## 3     3 12,09,2018 Kalapara    38         6      5 Tin            0.972 No    
## 4     4 12,09,2018 Kalapara    40         1      6 Tin            2.15  No    
## 5     5 12,09,2018 Kalapara    55         4      5 Tin            2.15  No    
## 6     6 12,09,2018 Kalapara    55         8      3 Tin            0.405 Yes   
## # ℹ 26 more variables: ccheard <chr>, temp <chr>, rain <chr>, cyclone <chr>,
## #   ricecultivation <chr>, nonricecultivation <chr>, vegecultivation <chr>,
## #   fruitcultivation <chr>, liverearing <chr>, fishculture <chr>,
## #   treeplantation <chr>, ricechange <chr>, nonricechange <chr>,
## #   vegechange <chr>, fruitchange <chr>, livechange <chr>, fishchange <chr>,
## #   treechange <chr>, ccfarmingsystem <chr>, ricefor <chr>, nonricefor <chr>,
## #   vegefor <chr>, fruitfor <chr>, livefor <chr>, fishfor <chr>, …
# Preview the last six rows
tail(df, n = 6L)
## # A tibble: 6 × 35
##    case date       location   age education family hroof   landcultivated credit
##   <dbl> <chr>      <chr>    <dbl>     <dbl>  <dbl> <chr>            <dbl> <chr> 
## 1   376 11,10,2018 Teknaf      50         0      8 Leaves…          0.81  No    
## 2   377 11,10,2018 Teknaf      35         0      5 Tin              0.162 No    
## 3   378 11,10,2018 Teknaf      40         0      6 Leaves…          0.648 No    
## 4   379 11,10,2018 Teknaf      38         5      6 Tin              0.04  No    
## 5   380 11,10,2018 Teknaf      32        11     14 Tin              0.81  No    
## 6   381 11,10,2018 Teknaf      45         0      9 Leaves…          0.105 No    
## # ℹ 26 more variables: ccheard <chr>, temp <chr>, rain <chr>, cyclone <chr>,
## #   ricecultivation <chr>, nonricecultivation <chr>, vegecultivation <chr>,
## #   fruitcultivation <chr>, liverearing <chr>, fishculture <chr>,
## #   treeplantation <chr>, ricechange <chr>, nonricechange <chr>,
## #   vegechange <chr>, fruitchange <chr>, livechange <chr>, fishchange <chr>,
## #   treechange <chr>, ccfarmingsystem <chr>, ricefor <chr>, nonricefor <chr>,
## #   vegefor <chr>, fruitfor <chr>, livefor <chr>, fishfor <chr>, …
# List the column names of the survey data
colnames(df)
##  [1] "case"               "date"               "location"          
##  [4] "age"                "education"          "family"            
##  [7] "hroof"              "landcultivated"     "credit"            
## [10] "ccheard"            "temp"               "rain"              
## [13] "cyclone"            "ricecultivation"    "nonricecultivation"
## [16] "vegecultivation"    "fruitcultivation"   "liverearing"       
## [19] "fishculture"        "treeplantation"     "ricechange"        
## [22] "nonricechange"      "vegechange"         "fruitchange"       
## [25] "livechange"         "fishchange"         "treechange"        
## [28] "ccfarmingsystem"    "ricefor"            "nonricefor"        
## [31] "vegefor"            "fruitfor"           "livefor"           
## [34] "fishfor"            "treefor"
# Descriptive statistics for respondent age
summary(df$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   24.00   36.00   45.00   46.76   57.00   90.00
sd(df$age)
## [1] 13.15927
# Standard error of the mean: SD / sqrt(n); the outer parentheses print it
(se <- sd(df$age) / sqrt(length(df$age)))
## [1] 0.67417
# 95% confidence interval: mean +/- 1.96 * SE (a 99% CI would use 2.58 * SE)
lower <- mean(df$age) - 1.96 * se
upper <- mean(df$age) + 1.96 * se

paste(
  "The 95% confidence interval for the mean age is between",
  round(lower, 2), "and", round(upper, 2)
)
## [1] "The 95% confidence interval for the mean age is between 45.44 and 48.08"
# The age of the respondents ranges from 24 to 90 years with a mean of 46.76 years and a standard deviation of 13.16 years. The standard error of the mean is 0.67 years. The 95% confidence interval for the mean age is between 45.44 and 48.08 years.


# Coefficient of variation: SD expressed as a percentage of the mean
cv <- sd(df$age) / mean(df$age) * 100
cv
## [1] 28.14146
# The coefficient of variation for age is 28.1% which indicates that the age of the respondents is relatively dispersed around the mean.

# Q. We want to know whether this age distribution is associated with their perception of temperature change over the last 5 years (survey year was 2018) compared to the previous 10 years.

# Inspect the structure of the two variables of interest
str(df[["age"]])
##  num [1:381] 41 50 38 40 55 55 40 42 62 65 ...
str(df[["temp"]])
##  chr [1:381] "Increased" "Increased" "Increased" "Increased" "Increased" ...
# We will use the chi-square test
# This test requires both variables to be categorical
# The chi-square test is used to assess the association between two categorical variables.

# We will first create a new variable that categorizes the age of the
# respondents into 3 groups: 40 or younger (<=40), 41-50, and over 50 (>50).
# cut() with right = TRUE builds right-closed intervals (-Inf, 40], (40, 50]
# and (50, Inf), so a 40-year-old falls in the first group; that group is
# therefore labelled "<=40" rather than "<40".

df$age_group <- cut(
  df$age,
  breaks = c(-Inf, 40, 50, Inf),
  labels = c("<=40", "41-50", ">50"),
  right = TRUE
)

table(df$age_group)
## 
##  <=40 41-50   >50 
##   155    92   134
# Create a contingency table using age_group and temp variable
contTable <- table(df$age_group, df$temp)
contTable
##        
##         Decreased Increased Not sure/unchanged
##   <=40          2       148                  5
##   41-50         1        90                  1
##   >50           5       126                  3
# Chi-square test of independence on the age group x perception table
result <- chisq.test(contTable)
## Warning in chisq.test(contTable): Chi-squared approximation may be incorrect
# Assumptions
# No more than 20% of the expected cell frequencies should be below 5 (and
# none below 1). The warning above signals that the expected counts are too
# small here, so the chi-square approximation may be unreliable.

result
## 
##  Pearson's Chi-squared test
## 
## data:  contTable
## X-squared = 3.8457, df = 4, p-value = 0.4273
# p-value > 0.05: there is insufficient evidence to reject H0
# H0: Age and temperature perception are independent
# (equivalently: there is no association between age and perception of temperature)
# We fail to reject H0
# We found no evidence that the age of the respondents is associated with their perception of temperature change over the last 5 years compared to the previous 10 years.

# The chi-square approximation was not reliable here, so we can use Fisher's exact test instead
fisher <- fisher.test(contTable)
fisher
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contTable
## p-value = 0.528
## alternative hypothesis: two.sided
# Similar conclusion as chi-square test

# For visualization of the residuals, please see the previous lessons.