STUDENT NUMBER - C18357081
STUDENT NAME - SEAN LYNCH
PROGRAMME CODE - TU059
R VERSION - R version 4.2.1
Packages required:
library(psych)      # Reverse coding, Bartlett's test, KMO, PCA and Cronbach's alpha
library(pwr)        # Power analysis for statistical tests
library(corrplot)   # Correlation matrix visualisation
library(ggcorrplot) # ggplot2-based correlation matrix plots
library(dplyr)      # Data manipulation
library(tidyverse)  # Data manipulation and plotting (ggplot2, tibble, etc.)
library(stringr)    # String handling
library(gtools)     # mixedsort()/mixedorder() for natural column ordering
library(REdaS)      # Additional EDA diagnostics
library(Hmisc)      # Miscellaneous statistical utilities
library(factoextra) # Used for principal component analysis to get a different view of eigenvalues
library(nFactors)   # Scree-based component retention rules
For regression:
needed_packages <- c("foreign", "lm.beta", "stargazer", "car", "ppcor","ggthemes","extrafont")
# Extract not installed packages
not_installed <- needed_packages[!(needed_packages %in% installed.packages()[ , "Package"])]
# Install not installed packages
if(length(not_installed)) install.packages(not_installed)
library(foreign)   # To work with SPSS data
library(lm.beta)   # To isolate the standardised beta coefficients
library(stargazer) # For formatting outputs/tables
library(ggthemes)  # Extra ggplot2 themes (e.g. theme_fivethirtyeight)
library(extrafont) # Additional fonts for plots
library(car)       # Levene's test
Begin by preprocessing the data
getwd()
[1] "D:/R Studio/R Projects/PSI_Assignment_2"
#Print version of R
R.version
_
platform x86_64-w64-mingw32
arch x86_64
os mingw32
crt ucrt
system x86_64, mingw32
status
major 4
minor 2.1
year 2022
month 06
day 23
svn rev 82513
language R
version.string R version 4.2.1 (2022-06-23 ucrt)
nickname Funny-Looking Kid
#Assign data set
studentPart2<-read.csv("studentpartII.csv", header = TRUE)
#Find number of features
ncol(studentPart2)
[1] 217
#Find column number of starting dimension
startingNo <- which(colnames(studentPart2)=="A1" )
#Create subset of main data frame used for dimension reduction
dimension_df <- studentPart2[,startingNo:ncol(studentPart2)]
#Remove the Likert-scale features that do not belong to the 50-item personality questionnaire
#(one character-class regex replaces eleven chained grep() calls; a chained call whose pattern matched nothing would silently have dropped every column via -integer(0))
dimension_df <- dimension_df[, -grep("[BDFGHIJKLMP]", colnames(dimension_df), perl = TRUE)]
#Create vector of negative features
negativeFeatures <- c("A1","A3","A5","A7","C2","C4","C6","C8","E2","E4","E6","E8","E10","N1","N3","N5","N6","N7","N8","N9","N10","O2","O4","O6")
#Create a vector of -1 values, one per negative feature (rep() replaces the original loop)
direction <- rep(-1, length(negativeFeatures))
#Next reverse-code the negative questions from the survey; on a 1-5 scale reverse.code() maps x to 6 - x
#all_of() avoids passing an external character vector straight into select(), which dplyr has deprecated
reversed <- as.data.frame(reverse.code(direction, dimension_df %>% select(all_of(negativeFeatures)), mini = 1, maxi = 5))
#Merge the reverse-coded columns back into the main data frame ("reversed" avoids shadowing base::subset)
dimension_df <- dimension_df %>% add_column(reversed)
#Drop the old, non-reverse-coded negative columns
dimension_df <- dimension_df[, !(names(dimension_df) %in% negativeFeatures)]
#Remove the minus suffix that reverse.code() appends to column names (sub() is vectorised, so no loop is needed)
colnames(dimension_df) <- sub("-", "", colnames(dimension_df))
#Sort numerically and alphabetically
dimension_df <- dimension_df[mixedorder(colnames(dimension_df))]
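As a quick sanity check on the recoding (assuming the 1-5 scale passed to reverse.code() above), every original value and its reverse-coded counterpart should sum to 6:
#A1 was reverse coded, so original + recoded values should all equal 6
all(studentPart2$A1 + dimension_df$A1 == 6)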
#Create correlation matrix to look for similarities among variables
corMatrix <- cor(dimension_df)
#Too many features to visualise in one matrix; divide into three non-overlapping blocks instead
corMatrix1 <- cor(dimension_df[, 1:17])
corMatrix2 <- cor(dimension_df[, 18:34])
corMatrix3 <- cor(dimension_df[, 35:50])
#Display matrix 1
ggcorrplot::ggcorrplot(corMatrix1, lab=TRUE, title = "Correlation matrix for personality data", type="lower",lab_size = 5)
#Display matrix 2
ggcorrplot::ggcorrplot(corMatrix2, lab=TRUE, title = "Correlation matrix for personality data", type="lower",lab_size = 5)
#Display matrix 3
ggcorrplot::ggcorrplot(corMatrix3, lab=TRUE, title = "Correlation matrix for personality data", type="lower",lab_size = 5)
#Select the 30 variables with the highest correlations: keep any variable involved in at least one correlation above 0.41 (threshold tuned so that exactly 30 survive)
v <- lapply(apply(corMatrix, 1, function(y) which(y > 0.41 & y < 1)), names)
#remove duplicates
v <- unlist(v)
v <- as.vector(v)
d <- v[!duplicated(v)]
#Order columns
d <- mixedsort(d)
#Get count, make sure it's equal to 30
length(d)
[1] 30
#Create 30x30 matrix
corMatrix <- corMatrix[d,d]
#Remove features from dimension table
dimension_df <- dimension_df[,d]
#Display matrix of the 30 selected dimensions
ggcorrplot::ggcorrplot(corMatrix, lab=TRUE, title = "Correlation matrix for personality data", type="lower",lab_size = 4)
##Step 2: Check if the data is suitable - look at the relevant statistics
###Bartlett's test
#Check whether the p value is statistically significant (a printed value of 0 means it is smaller than R can represent)
psych::cortest.bartlett(corMatrix, n=nrow(dimension_df))
$chisq
[1] 4606.932
$p.value
[1] 0
$df
[1] 435
###KMO
#Generate the relevant statistics using KMO
psych::KMO(dimension_df)
Kaiser-Meyer-Olkin factor adequacy
Call: psych::KMO(r = dimension_df)
Overall MSA = 0.84
MSA for each item =
A1 A2 A3 A5 A6 C2 C3 C4 C5 C6 C7 C8 C9 C10 E2 E3 E4 E5 N1 N3 N5 N6 N7 N8 N9 O2 O7 O8 O9 O10
0.81 0.90 0.77 0.88 0.87 0.87 0.88 0.88 0.85 0.84 0.84 0.86 0.88 0.88 0.83 0.78 0.83 0.81 0.90 0.79 0.84 0.80 0.88 0.85 0.87 0.85 0.72 0.76 0.78 0.79
###Determinant
#Run determinant to check for multicollinearity
det(cor(dimension_df))
[1] 3.935148e-06
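The determinant is computed above but never compared against a threshold; a commonly cited rule of thumb (e.g. Field) is that it should stay above 0.00001, which is easy to check programmatically:
#TRUE would indicate multicollinearity is not a concern at the conventional 0.00001 cutoff
det(cor(dimension_df)) > 0.00001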
##Step 3: Do the Dimension Reduction (PRINCIPAL COMPONENTS ANALYSIS)
#Carry out PCA
pc1 <- principal(dimension_df, nfactors = ncol(dimension_df), rotate = "none")
##Step 4: Decide which components to retain (PRINCIPAL COMPONENTS ANALYSIS)
#Create scree plot to visualise latent variables
plot(pc1$values, type = "b")
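If the Kaiser criterion is one of the retention rules in mind, a dashed reference line at an eigenvalue of 1 (an optional addition) makes the cutoff easier to read off the scree plot:
abline(h = 1, lty = 2) #Eigenvalue = 1 (Kaiser) cutoff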
#Print the variance explained by each component
#Note: princomp() uses the covariance matrix by default, so these eigenvalues differ in scale from pc1$values, which are based on the correlation matrix
pcf <- princomp(dimension_df)
factoextra::get_eigenvalue(pcf)
#Visualise variance explained
factoextra::fviz_eig(pcf, addlabels = TRUE, ylim = c(0, 50))
#Print the Eigenvalues
pc1$values
[1] 5.9885666 3.8930302 2.9775921 2.3748522 1.4356323 1.0874134 0.9488770 0.9201761 0.8237031 0.7465672 0.6881600 0.6624180 0.6233264 0.5976029 0.5480529 0.5137708 0.5028589 0.4867853
[19] 0.4820847 0.4274371 0.4120604 0.4028111 0.3810175 0.3584458 0.3266309 0.3213887 0.3019381 0.2871939 0.2566642 0.2229422
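nFactors is loaded above but never called; a possible cross-check of the retention decision, following the standard nFactors recipe, compares four rules (Kaiser, parallel analysis, optimal coordinates, acceleration factor) at once:
#Eigenvalues from the correlation matrix plus a parallel analysis baseline
ev <- eigen(cor(dimension_df))
ap <- nFactors::parallel(subject = nrow(dimension_df), var = ncol(dimension_df), rep = 100, cent = 0.05)
#Plot all four retention rules on one scree plot
ns <- nFactors::nScree(x = ev$values, aparallel = ap$eigen$qevpea)
nFactors::plotnScree(ns)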
#Print the loadings above the level of 0.3
psych::print.psych(pc1, cut = 0.3, sort = TRUE)
Principal Components Analysis
Call: principal(r = dimension_df, nfactors = ncol(dimension_df), rotate = "none")
Standardized loadings (pattern matrix) based upon correlation matrix
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28 PC29 PC30
SS loadings 5.99 3.89 2.98 2.37 1.44 1.09 0.95 0.92 0.82 0.75 0.69 0.66 0.62 0.60 0.55 0.51 0.50 0.49 0.48 0.43 0.41 0.40 0.38 0.36 0.33 0.32 0.30 0.29 0.26 0.22
Proportion Var 0.20 0.13 0.10 0.08 0.05 0.04 0.03 0.03 0.03 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
Cumulative Var 0.20 0.33 0.43 0.51 0.56 0.59 0.62 0.65 0.68 0.71 0.73 0.75 0.77 0.79 0.81 0.83 0.84 0.86 0.88 0.89 0.90 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1.00
Proportion Explained 0.20 0.13 0.10 0.08 0.05 0.04 0.03 0.03 0.03 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01
Cumulative Proportion 0.20 0.33 0.43 0.51 0.56 0.59 0.62 0.65 0.68 0.71 0.73 0.75 0.77 0.79 0.81 0.83 0.84 0.86 0.88 0.89 0.90 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1.00
Mean item complexity = 6.6
Test of the hypothesis that 30 components are sufficient.
The root mean square of the residuals (RMSR) is 0
with the empirical chi square 0 with prob < NA
Fit based upon off diagonal values = 1
#create a diagram showing the components and how the manifest variables load
fa.diagram(pc1)
#Show the loadings of variables onto components
fa.sort(pc1$loadings)
Loadings:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26
C8 -0.598 -0.417 -0.209 0.172 -0.353 0.154 0.111 -0.120 -0.125 0.175 0.108
N1 -0.588 0.404 0.214 0.211 -0.147 -0.118 -0.126 -0.128 0.435 0.105 -0.127 -0.154 -0.127 0.104 -0.108
A6 -0.587 0.331 -0.254 -0.191 0.242 -0.136 -0.147 0.298 0.212 -0.166 -0.243 0.219 0.126
C3 -0.585 -0.348 -0.123 -0.252 -0.149 0.346 0.140 0.154 -0.314 -0.109 0.206 0.124 -0.128
C2 0.577 0.182 0.450 -0.145 0.139 0.134 0.273 0.141 -0.291 -0.127 0.129 -0.195
C10 0.561 0.482 -0.101 0.119 -0.304 -0.149 0.172 -0.206 -0.128 -0.129 0.174 0.252
N9 0.560 -0.193 -0.117 0.105 0.342 -0.281 0.224 -0.327 0.148 0.250 0.178 0.211 0.117 0.127 -0.128
N8 0.557 -0.408 -0.179 0.251 0.203 0.330 -0.196 0.255 0.171 0.118
C7 0.552 0.544 -0.229 -0.244 0.101 0.164 0.153 -0.193 0.126
A5 0.513 -0.307 0.156 -0.355 -0.321 -0.103 0.105 0.185 0.290 0.105 0.168 0.297 0.164 0.182 0.150
C4 0.479 0.103 0.444 -0.262 0.128 -0.269 0.103 0.138 0.220 -0.185 -0.222 0.167 0.337 0.230 -0.142
A2 -0.494 0.549 -0.183 -0.185 -0.174 -0.119 0.164 -0.176 0.217 0.175 0.241 0.265 0.102 0.120 0.123
C9 0.474 0.531 0.146 -0.112 -0.155 -0.103 0.189 0.210 0.165 -0.212 0.111 0.203 0.149 -0.218 0.107 0.306
E3 -0.298 0.515 -0.187 -0.150 0.491 0.133 0.140 0.174 -0.292 0.194 0.119 0.214 0.106 0.148
E5 -0.419 0.506 -0.214 0.353 0.155 0.350 -0.207 0.202 0.106 -0.160 -0.141 -0.164
C6 -0.424 -0.501 0.215 0.143 0.231 0.295 -0.264 -0.111 -0.177 0.130 0.249 0.254 0.165
A3 0.237 -0.493 0.210 0.365 -0.330 0.170 0.195 0.235 -0.269 -0.183 -0.111 -0.113 -0.137 -0.113 -0.178 0.124
A1 0.363 -0.436 0.176 0.386 -0.362 0.167 -0.175 -0.105 0.171 0.201 0.147 0.200 0.120 0.103 0.244 -0.121
N6 -0.189 0.106 0.687 0.257 0.189 -0.114 0.146 0.101 -0.115 0.162 -0.276 0.192 -0.172 0.192 0.137 0.116 -0.143 0.162
N3 -0.270 0.684 0.273 0.218 0.100 0.145 -0.275 0.163 0.178 -0.113 -0.116 -0.266
N5 -0.255 0.610 0.131 -0.127 0.453 0.159 0.350 0.144 -0.269 -0.101 0.142
O8 0.240 0.397 -0.282 0.633 -0.150 0.112 -0.174 0.166 0.126 -0.155 0.215 -0.185 -0.132
O9 0.171 0.455 -0.241 0.564 -0.148 -0.108 0.102 0.218 0.331 -0.194 -0.201 -0.126 -0.119 -0.129 0.117 0.125
O2 0.466 0.183 0.517 0.240 0.160 -0.111 0.118 -0.195 -0.109 -0.157 -0.130 0.364 -0.190 -0.128 0.228 0.110
O7 0.298 -0.308 0.506 0.107 -0.267 0.168 0.315 -0.316 -0.158 -0.186 -0.212 -0.213 0.171 -0.106
O10 0.259 0.385 -0.227 0.465 -0.106 -0.284 0.173 0.140 0.185 -0.335 0.309 -0.129 -0.120 -0.103 0.113 0.150
E2 0.521 -0.283 -0.262 0.545 -0.116 0.176 0.160 0.132 0.120 -0.148 -0.268
E4 0.480 -0.314 -0.282 0.481 -0.107 0.287 0.255 0.173 -0.116 0.257
C5 -0.478 -0.270 -0.176 0.178 0.136 0.519 0.217 0.252 -0.123 0.252 0.141 0.117 -0.200 0.130 0.114 -0.102 -0.110
N7 -0.390 0.193 0.413 0.211 0.229 -0.200 0.104 -0.319 0.470 0.170 -0.156 0.264
PC27 PC28 PC29 PC30
C8 0.107 0.288 0.132
N1 0.140 0.114
A6 0.114
C3 0.142
C2 0.258
C10 -0.157 0.150 0.130
N9 -0.130
N8 0.239
C7 0.204 0.250
A5
C4 -0.110
A2 0.115
C9
E3 0.108
E5 0.113 -0.125
C6 0.192
A3 -0.142 0.148
A1 0.144
N6 -0.126
N3
N5
O8 0.146
O9 -0.116
O2
O7 -0.119 0.119
O10
E2 0.222
E4 -0.149 -0.148
C5
N7
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27 PC28
SS loadings 5.989 3.893 2.978 2.375 1.436 1.087 0.949 0.920 0.824 0.747 0.688 0.662 0.623 0.598 0.548 0.514 0.503 0.487 0.482 0.427 0.412 0.403 0.381 0.358 0.327 0.321 0.302 0.287
Proportion Var 0.200 0.130 0.099 0.079 0.048 0.036 0.032 0.031 0.027 0.025 0.023 0.022 0.021 0.020 0.018 0.017 0.017 0.016 0.016 0.014 0.014 0.013 0.013 0.012 0.011 0.011 0.010 0.010
Cumulative Var 0.200 0.329 0.429 0.508 0.556 0.592 0.624 0.654 0.682 0.707 0.729 0.752 0.772 0.792 0.811 0.828 0.844 0.861 0.877 0.891 0.905 0.918 0.931 0.943 0.954 0.964 0.974 0.984
PC29 PC30
SS loadings 0.257 0.223
Proportion Var 0.009 0.007
Cumulative Var 0.993 1.000
#Output the communalities of variables across components (all equal to 1 here because every component was retained, so all of each variable's variance is reproduced)
pc1$communality
A1 A2 A3 A5 A6 C2 C3 C4 C5 C6 C7 C8 C9 C10 E2 E3 E4 E5 N1 N3 N5 N6 N7 N8 N9 O2 O7 O8 O9 O10
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#Extract the per-variable PCA results (coordinates, cos2, contributions)
var <- factoextra::get_pca_var(pcf)
pcf
Call:
princomp(x = dimension_df)
Standard deviations:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17 Comp.18
2.8271492 2.2559476 2.0067566 1.8071015 1.3883412 1.1640629 1.1613896 1.0873376 1.0184864 0.9903319 0.9525178 0.9279766 0.9000703 0.8768925 0.8599753 0.8213080 0.8059652 0.7783444
Comp.19 Comp.20 Comp.21 Comp.22 Comp.23 Comp.24 Comp.25 Comp.26 Comp.27 Comp.28 Comp.29 Comp.30
0.7642765 0.7421275 0.7320527 0.7028343 0.6871585 0.6566088 0.6503461 0.6340017 0.5976996 0.5908779 0.5658531 0.5214949
30 variables and 382 observations.
# Contributions of variables to PC1
factoextra::fviz_contrib(pcf, choice = "var", axes = 1, top = 15)
# Contributions of variables to PC2
factoextra::fviz_contrib(pcf, choice = "var", axes = 2, top = 15)
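The var object extracted above otherwise goes unused; it bundles the per-variable PCA results, for example:
#Per-variable contribution (in %) to each principal component
head(var$contrib)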
##Step 5: Apply rotation
#Apply rotation to try to refine the component structure
pc2 <- principal(dimension_df, nfactors = 5, rotate = "varimax") #Extracting 5 components
#Output the components
psych::print.psych(pc2, cut = 0.3, sort = TRUE)
Principal Components Analysis
Call: principal(r = dimension_df, nfactors = 5, rotate = "varimax")
Standardized loadings (pattern matrix) based upon correlation matrix
RC1 RC2 RC3 RC4 RC5
SS loadings 4.57 3.49 2.96 2.86 2.79
Proportion Var 0.15 0.12 0.10 0.10 0.09
Cumulative Var 0.15 0.27 0.37 0.46 0.56
Proportion Explained 0.27 0.21 0.18 0.17 0.17
Cumulative Proportion 0.27 0.48 0.66 0.83 1.00
Mean item complexity = 1.5
Test of the hypothesis that 5 components are sufficient.
The root mean square of the residuals (RMSR) is 0.05
with the empirical chi square 854.09 with prob < 8.5e-56
Fit based upon off diagonal values = 0.95
#output the communalities
pc2$communality
A1 A2 A3 A5 A6 C2 C3 C4 C5 C6 C7 C8 C9 C10 E2 E3 E4 E5
0.6319497 0.5820283 0.5857894 0.5080803 0.6141037 0.6089643 0.4901762 0.5226045 0.3817761 0.4971566 0.6641092 0.6104192 0.5654758 0.5701558 0.7192275 0.4174463 0.6437218 0.4828927
N1 N3 N5 N6 N7 N8 N9 O2 O7 O8 O9 O10
0.6008702 0.6662960 0.4562360 0.6210054 0.4577728 0.5728467 0.3763213 0.5754977 0.4523387 0.6961437 0.6131172 0.4851502
#NOTE: all of the diagnostics applied to pc1 (scree plot, loadings, diagram, communalities) can be repeated for pc2, as sketched below
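For example, the loadings diagram drawn for pc1 applies directly to the rotated solution:
fa.diagram(pc2)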
##Step 6: Reliability Analysis
#test each group as a separate scale
extraversion <- dimension_df[,15:18]
agreeableness <- dimension_df[, 1:5]
neuroticism <- dimension_df[, 19:25]
conscientiousness <- dimension_df[,6:14]
openness <- dimension_df[,26:30]
#Output our Cronbach's alpha values (keys = -1 marks the items psych::alpha should reverse before scoring each scale)
psych::alpha(extraversion, keys = c(1,-1,1,-1))
psych::alpha(agreeableness, keys = c(1,-1,1,1,-1))
psych::alpha(neuroticism , keys = c(1,1,1,1,1,-1,-1))
psych::alpha(conscientiousness , keys = c(1,-1,1,-1,-1,1,-1,1,1))
psych::alpha(openness)
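Since the negative items were already reverse-coded during preprocessing, an alternative (not used here) is to let psych detect and flip any remaining reversed items automatically:
#check.keys = TRUE reverses items that load negatively on the first principal component and warns about them
psych::alpha(extraversion, check.keys = TRUE)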
We want to predict adult mortality rate; begin by testing the correlations between adult mortality rate and several candidate predictors.
#Create data.frame variable using life expectancy data set
lifeExpectancy <- read.csv("Life Expectancy Plus Geo Region.csv")
##FOR TOTAL EXPENDITURE:
Create subsets containing required variables
#Find the column indices of total expenditure and adult mortality rate
Total.expenditure <- which(colnames(lifeExpectancy) == "Total.expenditure")
mortalityIndex <- which(colnames(lifeExpectancy) == "Adult.Mortality")
#Create subset with needed variables
Totalexpenditurexmortality <- lifeExpectancy[c(1,Total.expenditure,mortalityIndex)]
Remove NA values
Totalexpenditurexmortality <- na.omit(Totalexpenditurexmortality)
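Note that sample_frac() draws a random subset, so the correlation results will vary between runs; fixing a seed beforehand (the value is arbitrary) makes the sample reproducible:
set.seed(123) #Hypothetical seed for reproducible sampling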
#Create sample
txm_sample <- Totalexpenditurexmortality %>% sample_frac(0.33)
Scatter Plot
#Create scatter plot of total expenditure against adult mortality rate, giving a visual representation of the correlation
scatter <- ggplot(txm_sample, aes(Total.expenditure, Adult.Mortality)) +
geom_point(alpha = 0.7, stroke = 0) +
theme_fivethirtyeight() +
scale_size(range = c(1,8), guide = "none") +
scale_x_log10() +
labs(title = "Total expenditure x mortality rate", x = "Total expenditure", y = "Adult mortality rate") +
theme(axis.title = element_text(), legend.text = element_text(size=10)) + scale_color_brewer(palette = "Set2") +
geom_smooth(method = "lm", colour = "Red", se = F)
scatter
#Use Pearson's test as we are working with continuous and discrete interval data
#Test correlation strength
pearson_results_Q1 <- cor.test(txm_sample$Total.expenditure, txm_sample$Adult.Mortality, method = "pearson")
pearson_results_Q1
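pwr is loaded at the top but never used; one way to put it to work here is a sensitivity check, solving for the smallest correlation detectable with this sample at conventional settings:
#Leaving r unspecified solves for the detectable effect size given n, alpha = .05 and power = .80
pwr::pwr.r.test(n = nrow(txm_sample), sig.level = 0.05, power = 0.8)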
Create subsets containing required variables
#Find the column indices of percentage expenditure and adult mortality rate
percentage.expenditure<- which(colnames(lifeExpectancy) == "percentage.expenditure")
mortalityIndex <- which(colnames(lifeExpectancy) == "Adult.Mortality")
#Create subset with needed variables
percentageexpenditurexmortality <- lifeExpectancy[c(1,percentage.expenditure,mortalityIndex)]
Remove NA values
percentageexpenditurexmortality <- na.omit(percentageexpenditurexmortality)
#Create sample
pxm_sample <- percentageexpenditurexmortality %>% sample_frac(0.33)
Scatter Plot
#Create scatter plot of percentage expenditure against adult mortality rate, giving a visual representation of the correlation
scatter1 <- ggplot(pxm_sample, aes(percentage.expenditure, Adult.Mortality)) +
geom_point(alpha = 0.7, stroke = 0) +
theme_fivethirtyeight() +
scale_size(range = c(1,8), guide = "none") +
scale_x_log10() +
labs(title = "Percentage expenditure x mortality rate", x = "Percentage expenditure", y = "Adult mortality rate") +
theme(axis.title = element_text(), legend.text = element_text(size=10)) + scale_color_brewer(palette = "Set2") +
geom_smooth(method = "lm", colour = "Red", se = F)
scatter1
#Use Pearson's test as we are working with continuous and discrete interval data
#Test correlation strength
pearson_results_2 <- cor.test(pxm_sample$percentage.expenditure, pxm_sample$Adult.Mortality, method = "pearson")
pearson_results_2
Create subsets containing required variables
#Find the column indices of GDP and adult mortality rate
GDP <- which(colnames(lifeExpectancy) == "GDP")
mortalityIndex <- which(colnames(lifeExpectancy) == "Adult.Mortality")
#Create subset with needed variables
GDPxmortality <- lifeExpectancy[c(1,GDP,mortalityIndex)]
Remove NA values
GDPxmortality <- na.omit(GDPxmortality)
#Create sample
gxm_sample <- GDPxmortality %>% sample_frac(0.33)
Scatter Plot
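No plotting code follows this heading; a sketch that mirrors the two expenditure plots above (the same aesthetics are assumed) would be:
#Scatter plot of GDP against adult mortality rate with a fitted regression line
scatter2 <- ggplot(gxm_sample, aes(GDP, Adult.Mortality)) +
geom_point(alpha = 0.7, stroke = 0) +
theme_fivethirtyeight() +
scale_x_log10() +
labs(title = "GDP x mortality rate", x = "GDP", y = "Adult mortality rate") +
theme(axis.title = element_text()) +
geom_smooth(method = "lm", colour = "Red", se = F)
scatter2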
#Use Pearson's test as we are working with continuous and discrete interval data
#Test correlation strength
pearson_results_3 <- cor.test(gxm_sample$GDP, gxm_sample$Adult.Mortality, method = "pearson")
pearson_results_3