library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(dplyr)
# Remember the working directory; the relative workbook path below is
# resolved against it.
wd <- getwd()

# Load the survey workbook.
ECD <- read_xlsx("ECD.xlsx")

# Keep the four variables of interest, then sort by household size
# (largest first), breaking ties by usage, cost and income (ascending).
Energy_example <- ECD %>%
  select(
    `Total people in household`,
    `2023 Total Usage`,
    `2023 Total Usage Cost`,
    `Annual Household Income`
  ) %>%
  arrange(
    desc(`Total people in household`),
    `2023 Total Usage`,
    `2023 Total Usage Cost`,
    `Annual Household Income`
  )

# Show a preview of the resulting tibble.
Energy_example
## # A tibble: 299 × 4
##    `Total people in household` `2023 Total Usage` `2023 Total Usage Cost`
##                          <dbl>              <dbl>                   <dbl>
##  1                          12              26979                   3066.
##  2                          12              33292                   3866.
##  3                          11              14661                   2177.
##  4                           9              18379                   2362.
##  5                           9              33903                   3963.
##  6                           8              23557                   2695.
##  7                           8              25739                   3007.
##  8                           8              26716                   3242.
##  9                           8              27099                   3441.
## 10                           8              31790                   3599.
## # ℹ 289 more rows
## # ℹ 1 more variable: `Annual Household Income` <dbl>

###2)Select some variables of interest and see if there are any obvious correlations using the COR command###

###THERE ARE POSITIVE CORRELATIONS BETWEEN ALL VARIABLES. HOWEVER, THE CORRELATION BETWEEN “2023 TOTAL USAGE” AND “2023 TOTAL USAGE COST” IS THE HIGHEST AT 0.99, WHICH INDICATES A VERY STRONG POSITIVE CORRELATION.###

# Pairwise correlation matrix of the four selected variables
# (cor() with its default method).
Energy_example %>%
  cor()
##                           Total people in household 2023 Total Usage
## Total people in household                1.00000000       0.31104227
## 2023 Total Usage                         0.31104227       1.00000000
## 2023 Total Usage Cost                    0.32861686       0.99035195
## Annual Household Income                  0.06568583       0.08164083
##                           2023 Total Usage Cost Annual Household Income
## Total people in household             0.3286169              0.06568583
## 2023 Total Usage                      0.9903519              0.08164083
## 2023 Total Usage Cost                 1.0000000              0.11036230
## Annual Household Income               0.1103623              1.00000000

###3)Examine the same variables visually using the PAIRS command###

# Scatterplot matrix of the same four variables, taken directly from ECD.
pairs(
  ~ `Total people in household` +
    `2023 Total Usage` +
    `2023 Total Usage Cost` +
    `Annual Household Income`,
  data = ECD
)

###4)Select two variables that seem correlated (positively or negatively) and examine them using PEARSON,SPEARMAN or KENDALL (depending on which is more appropriate)###

###RUNNING A HISTOGRAM TO SEE THE DISTRIBUTION OF THE DATA AND DETERMINE ITS NORMALITY###

# Density-scaled histogram of total usage with a kernel density overlay,
# used to judge by eye how close the distribution is to normal.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
hist(Energy_example$`2023 Total Usage`, probability = TRUE)
lines(density(Energy_example$`2023 Total Usage`), col = "red", lwd = 3)

# Same view for total usage cost.
hist(Energy_example$`2023 Total Usage Cost`, probability = TRUE)
lines(density(Energy_example$`2023 Total Usage Cost`), col = "red", lwd = 3)

###5)Explain your findings and justify your choice in selecting the correlation method###

###THE HISTOGRAMS SHOW THAT THE DATA ARE NOT NORMALLY DISTRIBUTED, AS BOTH VARIABLES ARE SKEWED TO THE RIGHT. GIVEN THAT THE DATA ARE NOT NORMAL, THE KENDALL TEST IS THE MOST APPROPRIATE TEST TO RUN, AS IT DOES NOT REQUIRE THE DATA TO BE NORMAL AND ALSO TAKES TIES IN THE DATA INTO ACCOUNT. BECAUSE THE P-VALUE OF THE TEST IS LESS THAN 0.05, THE CORRELATION BETWEEN “2023 TOTAL USAGE” AND “2023 TOTAL USAGE COST” CAN BE CONSIDERED STATISTICALLY SIGNIFICANT.###

# Rank-based (Kendall's tau) test of association between total usage and
# total usage cost.
cor.test(
  x = Energy_example$`2023 Total Usage`,
  y = Energy_example$`2023 Total Usage Cost`,
  method = "kendall"
)
## 
##  Kendall's rank correlation tau
## 
## data:  Energy_example$`2023 Total Usage` and Energy_example$`2023 Total Usage Cost`
## z = 22.958, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
##       tau 
## 0.8903479

###BECAUSE THE DATA ARE SKEWED TO THE RIGHT, I DECIDED TO LOG-TRANSFORM THEM TO SEE WHETHER THE TRANSFORMED DATA ARE NORMALLY DISTRIBUTED. THIS WOULD ALLOW ME TO RUN A PEARSON TEST, WHICH SHOULD ONLY BE USED WHEN THE DATA ARE NORMAL. NOTE: TU = TOTAL USAGE & TUC = TOTAL USAGE COST###

# Append natural-log versions of total usage (TU_LOG) and total usage cost
# (TUC_LOG) to the selected variables.
Energy_example_log <- Energy_example %>%
  mutate(
    TU_LOG = log(`2023 Total Usage`),
    TUC_LOG = log(`2023 Total Usage Cost`)
  )

# Preview the first rows, including the new columns.
head(Energy_example_log)
## # A tibble: 6 × 6
##   `Total people in household` `2023 Total Usage` `2023 Total Usage Cost`
##                         <dbl>              <dbl>                   <dbl>
## 1                          12              26979                   3066.
## 2                          12              33292                   3866.
## 3                          11              14661                   2177.
## 4                           9              18379                   2362.
## 5                           9              33903                   3963.
## 6                           8              23557                   2695.
## # ℹ 3 more variables: `Annual Household Income` <dbl>, TU_LOG <dbl>,
## #   TUC_LOG <dbl>

###THE HISTOGRAMS OF THE TRANSFORMED DATA INDICATE THAT THE DATA ARE NOW APPROXIMATELY NORMALLY DISTRIBUTED###

# Density-scaled histogram of log total usage with a kernel density overlay,
# to check whether the log transform brought the distribution closer to normal.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
hist(Energy_example_log$TU_LOG, probability = TRUE)
lines(density(Energy_example_log$TU_LOG), col = "red", lwd = 3)

# Same view for log total usage cost.
hist(Energy_example_log$TUC_LOG, probability = TRUE)
lines(density(Energy_example_log$TUC_LOG), col = "red", lwd = 3)

###NOW THAT THE DATA IS NORMAL I CAN NOW RUN A PEARSON TEST###

###THE PEARSON TEST SHOWS THAT THE CORRELATION IS SIGNIFICANT, AS THE P-VALUE IS LESS THAN 0.05, AND THAT THE CORRELATION COEFFICIENT IS 0.99. THESE FINDINGS INDICATE A VERY STRONG POSITIVE CORRELATION.###

# Pearson test on the log-transformed columns (TU_LOG, TUC_LOG), which is the
# stated reason for the transformation. The call previously passed the
# untransformed `2023 Total Usage` / `2023 Total Usage Cost` columns, so the
# log-transformed data were never actually tested.
# NOTE(review): the echoed output below (r = 0.9903519, identical to the raw
# cor() matrix entry) came from that earlier call on the raw columns; re-run
# this chunk to refresh the reported statistics.
cor.test(
  Energy_example_log$TU_LOG,
  Energy_example_log$TUC_LOG,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  Energy_example_log$`2023 Total Usage` and Energy_example_log$`2023 Total Usage Cost`
## t = 123.16, df = 297, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9878981 0.9923102
## sample estimates:
##       cor 
## 0.9903519