Basics

Jeong Eun Cheon

cheonje@yonsei.ac.kr

Basics
Preparing for the analysis
- Read in data
Preliminary
Regression

Basics

Script file: write & edit R scripts
Environment: list of all the R objects loaded in the workspace
Console: R coded is executed here
Plots: for graphical outputs

Preparing for the analysis

R has different data structures

data frame
matrix
list
vectors and others…

R has different data types

numeric
integer
character
logical
factor
complex
raw
others: NA (mssing), NaN (undefined mathematical operations), Inf.

data structures can be classified based on their dimensionality (1D, 2D, or ND) and whether they are homogeneous (all elements of the same type) or heterogeneous (elements can be of different types).

Read in data

data<- read.csv(file.choose()) data<- read.spss(file.choose()) data<- data.frame(read.spss(file.choose()))

#install.packages("foreign")

library(foreign)

## Warning: package 'foreign' was built under R version 4.3.2

#install.packages("tidyverse")
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.2

data <- read_csv("Practice.csv")

library(haven)
data <- read_sav('C:/Users/user/Downloads/Practice.sav')
data <- read_sav('Practice.sav')

head(data)

## # A tibble: 6 × 7
##   gender    comm_freq criticism1 criticism2 criticism3 rel_sat criticism
##   <dbl+lbl>     <dbl>      <dbl>      <dbl>      <dbl>   <dbl>     <dbl>
## 1 2 [Women]         7       2.80       3.54       3.15    8.19      3.16
## 2 1 [Men]           7       4.35       4.37       6.09    6.69      4.94
## 3 1 [Men]           3       3.51       3.33       3.46    4.73      3.44
## 4 2 [Women]         6       3.49       2.94       3.88    6.34      3.44
## 5 2 [Women]         3       3.03       2.15       2.30    5.49      2.50
## 6 1 [Men]           2       2.44       3.49       2.11    5.75      2.68

tail(data)

## # A tibble: 6 × 7
##   gender    comm_freq criticism1 criticism2 criticism3 rel_sat criticism
##   <dbl+lbl>     <dbl>      <dbl>      <dbl>      <dbl>   <dbl>     <dbl>
## 1 1 [Men]           3       3.31       3.60       3.80    4.01      3.57
## 2 1 [Men]           1       3.80       4.15       4.00    3.79      3.98
## 3 1 [Men]           7       4.29       3.24       5.89    6.70      4.47
## 4 1 [Men]           6       3.66       3.49       5.68    5.57      4.28
## 5 2 [Women]         5       5.50       4.61       4.04    7.07      4.72
## 6 2 [Women]         5       3.87       3.98       4.11    5.57      3.99

library(dplyr)
glimpse(data)

## Rows: 200
## Columns: 7
## $ gender     <dbl+lbl> 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1…
## $ comm_freq  <dbl> 7, 7, 3, 6, 3, 2, 2, 6, 3, 5, 4, 6, 6, 1, 2, 3, 5, 3, 3, 1,…
## $ criticism1 <dbl> 2.800655, 4.354122, 3.512366, 3.492076, 3.031426, 2.437645,…
## $ criticism2 <dbl> 3.544965, 4.374847, 3.332238, 2.939193, 2.152264, 3.488589,…
## $ criticism3 <dbl> 3.145929, 6.086450, 3.461761, 3.884228, 2.302969, 2.111063,…
## $ rel_sat    <dbl> 8.188753, 6.687051, 4.727543, 6.338100, 5.485590, 5.749247,…
## $ criticism  <dbl> 3.163850, 4.938473, 3.435455, 3.438499, 2.495553, 2.679099,…

Preliminary

library(psych)
alpha(data[,c("criticism1","criticism2", "criticism3")])

## 
## Reliability analysis   
## Call: alpha(x = data[, c("criticism1", "criticism2", "criticism3")])
## 
##   raw_alpha std.alpha G6(smc) average_r S/N  ase mean sd median_r
##       0.91      0.92    0.88      0.78  11 0.01    4  1     0.78
## 
##     95% confidence boundaries 
##          lower alpha upper
## Feldt     0.89  0.91  0.93
## Duhachek  0.89  0.91  0.94
## 
##  Reliability if an item is dropped:
##            raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## criticism1      0.87      0.87    0.77      0.77 6.9    0.018    NA  0.77
## criticism2      0.87      0.87    0.78      0.78 6.9    0.018    NA  0.78
## criticism3      0.89      0.89    0.80      0.80 7.9    0.016    NA  0.80
## 
##  Item statistics 
##              n raw.r std.r r.cor r.drop mean  sd
## criticism1 200  0.92  0.93  0.87   0.84    4 1.1
## criticism2 200  0.93  0.93  0.87   0.83    4 1.1
## criticism3 200  0.92  0.92  0.85   0.82    4 1.2

data$criticism <- rowMeans(data[,c("criticism1","criticism2", "criticism3")], na.rm=TRUE)


library(qwraps2)
mean_sd(data$criticism)

## [1] "4.03 $\\pm$ 1.04"

mean_sd(data$rel_sat)

## [1] "5.61 $\\pm$ 1.67"

mean_sd(data$comm_freq)

## [1] "4.23 $\\pm$ 2.05"

Regression

lm(rel_sat ~ comm_freq + criticism, data)

## 
## Call:
## lm(formula = rel_sat ~ comm_freq + criticism, data = data)
## 
## Coefficients:
## (Intercept)    comm_freq    criticism  
##     3.21751      0.61102     -0.04782

summary(lm(rel_sat ~ comm_freq + criticism, data))

## 
## Call:
## lm(formula = rel_sat ~ comm_freq + criticism, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9942 -0.7790 -0.0983  0.8457  3.1609 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  3.21751    0.34828   9.238 <0.0000000000000002 ***
## comm_freq    0.61102    0.03881  15.744 <0.0000000000000002 ***
## criticism   -0.04782    0.07622  -0.627               0.531    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.12 on 197 degrees of freedom
## Multiple R-squared:  0.5573, Adjusted R-squared:  0.5528 
## F-statistic:   124 on 2 and 197 DF,  p-value: < 0.00000000000000022

a <- lm(rel_sat ~ comm_freq + criticism, data)
summary(a)

## 
## Call:
## lm(formula = rel_sat ~ comm_freq + criticism, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9942 -0.7790 -0.0983  0.8457  3.1609 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  3.21751    0.34828   9.238 <0.0000000000000002 ***
## comm_freq    0.61102    0.03881  15.744 <0.0000000000000002 ***
## criticism   -0.04782    0.07622  -0.627               0.531    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.12 on 197 degrees of freedom
## Multiple R-squared:  0.5573, Adjusted R-squared:  0.5528 
## F-statistic:   124 on 2 and 197 DF,  p-value: < 0.00000000000000022

Regression Model Diagnostics

Assumptions

Linearity (the expected value of the dependent variable can be expressed as a linear combination of the parameters (regression coefficients) times the independent variables)
Normality of residuals (multivariate normality)
Homogeneity of residuals
Independence of residuals/observations
No multicollinearity

par(mfrow = c(2, 2))
plot(a)

Residuals vs Fitted: This plot is essential for checking the assumption of linearity in your regression model. It plots the residuals (differences between observed and predicted values) on the y-axis against the fitted values (predicted values) on the x-axis. Ideally, you want to see a random scatter of points around a horizontal line at zero without any discernible patterns. A random dispersion indicates that the model’s predictions are unbiased at all levels of the predictor variable, affirming linearity. However, systematic patterns or a non-horizontal trend suggest a violation of the linearity assumption, indicating that the model may not have captured some aspect of the data’s structure.
Normal Q-Q: A Quantile-Quantile (Q-Q) plot is a graphical tool to evaluate if a dataset follows a certain theoretical distribution, such as a normal distribution. The Q-Q plot does this by comparing the quantiles of the sample data against the quantiles of the theoretical distribution. If the data follow the theoretical distribution closely, the points on the Q-Q plot will lie approximately along a straight line. The most common use of the QQ plot is to check the normality of a distribution. In an ideal scenario, the points should fall approximately along the reference line (a dashed line on the plot), indicating that the residuals are normally distributed. Deviations from this line suggest deviations from normality, such as skewness or heavy tails, which could affect hypothesis testing.
Scale-Location (or Spread-Location): This plot is designed to check for homoscedasticity, meaning that the residuals have constant variance across all levels of the predictors. It plots the square root of the absolute residuals against the fitted values. When we square the residuals (or use absolute values followed by a square root), small residuals become smaller and large residuals become larger in terms of their relative size. This operation magnifies the impact of changes in variance. In a plot of squared residuals (or absolute value squared), areas of higher variance are more visually pronounced than in a plot of the residuals themselves. This makes it easier to see if the variance is changing as a function of the predicted values, which is the definition of homoscedasticity (constant variance) versus heteroscedasticity (changing variance). Like with the Residuals vs Fitted plot, you are looking for a random scatter of points, this time focusing on the spread of points to be even across the range of fitted values. A plot exhibiting a horizontal trend with a similar spread of residuals across the full range of fitted values suggests homoscedasticity. Conversely, a pattern where the spread increases or decreases with the fitted values indicates heteroscedasticity, a potential violation of regression assumptions.
Residuals vs Leverage: The purpose of the Residuals vs Leverage plot is to identify influential observations—data points that have a substantial impact on the estimation of the regression coefficients. These points are not necessarily outliers in the y-direction (residuals) but have leverage due to their x-values. The plot helps in detecting these points by showing both the residuals and the leverage of each observation. Observations that are far from the majority of the data in the x-direction will appear towards the right of the plot. The Cook’s distance contours shown in the plot help identify points that are influential across all predictors. Leverage is a measure of how far an individual data point’s value of the independent variable is from the mean of all the values of the independent variable. Points that have extreme values on the independent variable(s) have high leverage because they can have a large effect on the slope of the regression line. Essentially, leverage is about the “location” of the independent variable values, not about the error or residual. We are interested in points that are outliers in the y-direction (large residuals) and have high leverage (extreme x-values). These points are influential because they can change the regression line significantly. If these points also fall outside of certain Cook’s distance thresholds, it indicates they are influential across the regression model and can potentially distort the analysis.

#multivariate normality
shapiro.test(resid(a))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(a)
## W = 0.99057, p-value = 0.2158

#fail to reject: no evidence against the normality of the residuals from your model
#yet the Shapiro-Wilk test can be sensitive to large sample sizes, often rejecting the normality of the residuals for even small deviations from normality that have little practical significance.
#Look at the Q-Q plot of the residuals to visually assess normality. Sometimes, the visual inspection might suggest that the departure from normality is not severe.
#Other options: transform the variable, remove outlier, bootstrap ... etc.

#multicollinearity
library(car)
car::vif(a)

## comm_freq criticism 
##  1.003684  1.003684

library(olsrr)

## Warning: package 'olsrr' was built under R version 4.3.3

ols_plot_resid_qq(a)

ols_test_normality(a)

## -----------------------------------------------
##        Test             Statistic       pvalue  
## -----------------------------------------------
## Shapiro-Wilk              0.9906         0.2158 
## Kolmogorov-Smirnov        0.0596         0.4757 
## Cramer-von Mises         14.9497         0.0000 
## Anderson-Darling          0.8033         0.0370 
## -----------------------------------------------

#homoskedasticity
library(lmtest)
bptest(a)

## 
##  studentized Breusch-Pagan test
## 
## data:  a
## BP = 14.023, df = 2, p-value = 0.0009016

#outliers
plot (a, which = 1:5)

filtered <- data[-c(78, 138, 179), ]  #give me all columns of data but exclude rows 78, 138, and 179 (Removes rows 78, 138, and 179)

Correlation

library(psych)
corr.test(data[c("comm_freq", "criticism", "rel_sat")])

## Call:corr.test(x = data[c("comm_freq", "criticism", "rel_sat")])
## Correlation matrix 
##           comm_freq criticism rel_sat
## comm_freq      1.00      0.06    0.75
## criticism      0.06      1.00    0.02
## rel_sat        0.75      0.02    1.00
## Sample Size 
## [1] 200
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           comm_freq criticism rel_sat
## comm_freq      0.00      0.79    0.00
## criticism      0.39      0.00    0.83
## rel_sat        0.00      0.83    0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

library(metan)
plot(corr_coef(data[c("comm_freq", "criticism", "rel_sat")]))

print(corr_coef(data[c("comm_freq", "criticism", "rel_sat")]))

## ---------------------------------------------------------------------------
## Pearson's correlation coefficient
## ---------------------------------------------------------------------------
##           comm_freq criticism rel_sat
## comm_freq    1.0000    0.0606  0.7459
## criticism    0.0606    1.0000  0.0155
## rel_sat      0.7459    0.0155  1.0000
## ---------------------------------------------------------------------------
## p-values for the correlation coefficients
## ---------------------------------------------------------------------------
##                                           comm_freq criticism
## comm_freq 0.000000000000000000000000000000000000000     0.394
## criticism 0.394072582828221884554409371048677712679     0.000
## rel_sat   0.000000000000000000000000000000000000855     0.827
##                                             rel_sat
## comm_freq 0.000000000000000000000000000000000000855
## criticism 0.827462910778318794235985933482879772782
## rel_sat   0.000000000000000000000000000000000000000

Visualization

library("ggpubr")
ggscatter(data, x = "criticism", y = "rel_sat", 
          add = "reg.line", conf.int = TRUE, 
          xlab = "Criticism", ylab = "Relationship Satisfaction")