Start Date: 23 August 2025
Report Date: 31 August 2025
Source: Psi Chi R
knitr::opts_chunk$set(echo = T,message = F,warning = F)
#setwd("C:/Users/alanh/Documents/R/Psi_Chi_R")
rm(list=ls())
setwd("~/R/Psi_Chi_R")
library(tidyverse)
#commenting out code but not for text
if (FALSE) {
}
#total for bottom row
sum_rows = function(x) {
x = as.data.frame(x) # Ensure x is a data frame
sums = sapply(x,function(col) if (is.numeric(col)) sum(col, na.rm = T) else NA)
sums = as.data.frame(t(sums)) # Convert to data frame
names(sums) = names(x) # Retain column names
rbind(x, sums) # Bind the sum row to the original data frame
}
# right column for total
sum_cols = function(x) {
x$Total = rowSums(x[sapply(x, is.numeric)], na.rm = T)
x
}
options(scipen=999) # disable scientific notation
#dollar format function
dollars = function(x) {
paste0("$",format(x,big.mark= ",",scientific=F))
}
desc_stats = function(x){
c(min = min(x,na.rm=T),
median = median(x,na.rm=T),
max = max(x,na.rm=T),
mean = mean(x,na.rm=T),
sd = sd(x,na.rm=T))
}
Clean and EDA
data=read_csv('data.csv',show_col_types = F)
# data = readxl::read_excel('C:/Users/alanh/Downloads/2025Apr_data.xlsx')
names(data) = make.names(colnames(data))
#SmartEDA::ExpData(data,type=2) %>% arrange(desc(Per_of_Missing))
#skimr::skim(data)
Level 1: Preparing the data
Remove participants who are missing values for
‘Age’; Remove participants who are missing values for ‘Hoursdad’
Test your skills: Remove participants with missing
values for ‘Age’ and ‘Hoursdad’
data1 = data |>
filter(!is.na(Age),!is.na(Hoursdad)) |>
arrange(Progress) |>
mutate(Progress = as.character(Progress))
if (FALSE) {
hist(data1$Age,prob=T,col='steelblue')
lines(density(data1$Age),lwd=2,col='darkred')
data1 |>
ggplot(aes(x=Age))+
geom_density()+
theme_bw()
}
Level 2: Data Prep. and inspection
Create a variable called ‘PsyContF’ by summing
together the following variables: DyadF1+ DyadF2+ DyadF3+ DyadF4+
DyadF5+ DyadF6+ DyadF7
dyad_df = data1 |>
mutate(Progress = as.character(Progress)) |>
select(Progress,DyadF1, DyadF2,DyadF3, DyadF4, DyadF5,DyadF6, DyadF7) |>
sum_cols() |>
rename(PsyContF = Total) |>
select(PsyContF,P1=Progress)
data2 = data1 |>
cbind(dyad_df) |>
select(PsyContF,everything())
sum(is.na(data2$PsyContF))
## [1] 0
Test your skills: Visualize outlier values for
‘PsyContF’.
boxplot(data2$PsyContF,main='PsyContF')

data2 |>
ggplot(aes(x=PsyContF))+
geom_boxplot()

summary(data2$PsyContF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.00 9.00 11.57 15.00 35.00
#hist(data2$PsyContF)
Test your skills: If outliers are present, remove
them.
data3 = data2 |>
filter(PsyContF < 25)
boxplot(data3$PsyContF)

Create a variable called FACEcomm (Family
Communication) by adding together the following items FACES43 + FACES44
+ FACES45 + FACES46 + FACES47 + FACES48 + FACES49 + FACES50 + FACES51 +
FACES52.
face_df = data3 |>
select(FACES43, FACES44, FACES45, FACES46 ,FACES47 , FACES48 , FACES49 ,FACES50, FACES51 ,FACES52,Progress) |>
sum_cols() |>
rename(FACEcomm = Total) |>
select(FACEcomm,P2=Progress)
data4 = data3 |>
#left_join(face_df,by=join_by(Progress==P2)) |>
cbind(face_df) |>
select(FACEcomm,PsyContF,P1,P2,everything()) |>
filter(FACEcomm >= 22)
sum(is.na(data4$FACEcomm))
## [1] 0
summary(data4$FACEcomm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.00 33.00 38.00 37.86 42.00 50.00
boxplot(data4$FACEcomm)

Test your skills: Visualize the distribution of the FACEcomm
hist(data4$FACEcomm,probability = T,col='steelblue',main='Distro of Family Communication',xlab = 'FACEcomm')
lines(density(data4$FACEcomm),lwd=2,col='red')

Level 4: Inferential + Other Statistics
Is there a correlation between ‘PsyContF’ and
‘FACEcomm?’ Note any relevant statistics.
Normality test and plot
for (i in level3_list){
x=shapiro.test(data4[[i]])
print(i)
print(x)
}
## [1] "FACEcomm"
##
## Shapiro-Wilk normality test
##
## data: data4[[i]]
## W = 0.97644, p-value = 0.00000115
##
## [1] "PsyContF"
##
## Shapiro-Wilk normality test
##
## data: data4[[i]]
## W = 0.86865, p-value < 0.00000000000000022
for (i in level3_list){
hist(data4[[i]],main=i,xlab=i,probability = T,col='steelblue')
lines(density(data4[[i]]),lwd=2,col='red')
}


#not normally distributed
Correlation
cor.test(data4$PsyContF,data4$FACEcomm,method = 'k')
##
## Kendall's rank correlation tau
##
## data: data4$PsyContF and data4$FACEcomm
## z = -5.2321, p-value = 0.0000001676
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## -0.1790498
if (FALSE) {
method_list = list('k','s','p')
for (i in method_list){
x=cor.test(data4$PsyContF,data4$FACEcomm, method = i)
print(x)
}
}
#according to Kendall's tau, there is a statistically significant weak negative correlation of -0.17 between FACEcomm and PsyContF.
Plot
data4 |>
ggplot(aes(x=PsyContF,y=FACEcomm))+
geom_point()+
theme_bw()+
geom_smooth(method = 'lm')+
labs(title='What in the Family Communication and PsyContF is Happening Here?')+
theme(plot.title = element_text(hjust = .5))

---
title: "Psi Chi R - Aug 2025"
#date: "`r Sys.Date()`"
output:
  html_document:
    lightbox: true
    theme: readable
    always_allow_html: yes
    df_print: paged
    toc: yes
    toc_float: yes
    number_sections: no
    anchor_sections: TRUE
    code_folding: hide
    code_download: true
---

Start Date: 23 August 2025

Report Date: `r format(Sys.Date(), '%d %B %Y')`

[Source: Psi Chi R](https://osf.io/gr56s/wiki/home/)

```{r warning=F,message=F}
knitr::opts_chunk$set(echo = T,message = F,warning = F)

#setwd("C:/Users/alanh/Documents/R/Psi_Chi_R")

rm(list=ls())

setwd("~/R/Psi_Chi_R")

library(tidyverse)

#commenting out code but not for text
if (FALSE) {

}

#total for bottom row

sum_rows = function(x) {
  x = as.data.frame(x) # Ensure x is a data frame
  sums = sapply(x,function(col) if (is.numeric(col)) sum(col, na.rm = T) else NA)
  sums = as.data.frame(t(sums)) # Convert to data frame
  names(sums) = names(x) # Retain column names
  rbind(x, sums) # Bind the sum row to the original data frame
}

# right column for total
sum_cols = function(x) {
  x$Total = rowSums(x[sapply(x, is.numeric)], na.rm = T)
  x
}

options(scipen=999) # disable scientific notation

#dollar format function
dollars = function(x) {
  paste0("$",format(x,big.mark= ",",scientific=F))
}

desc_stats = function(x){
  c(min = min(x,na.rm=T),
    median = median(x,na.rm=T),
    max = max(x,na.rm=T),
    mean = mean(x,na.rm=T),
    sd = sd(x,na.rm=T))
}
```

## Clean and EDA

```{r}
data=read_csv('data.csv',show_col_types = F)

# data = readxl::read_excel('C:/Users/alanh/Downloads/2025Apr_data.xlsx')

names(data) = make.names(colnames(data))

#SmartEDA::ExpData(data,type=2) %>% arrange(desc(Per_of_Missing))

#skimr::skim(data)
```

### Data set:

```{r echo=F}
downloadthis::download_this(data,output_name = 'dataset',output_extension = '.xlsx')
```

## Level 1: Preparing the data

### Remove participants who are missing values for ‘Age’; Remove participants who are missing values for ‘Hoursdad’

### Test your skills: Remove participants with missing values for ‘Age’ and ‘Hoursdad’

```{r}
data1 = data |> 
  filter(!is.na(Age),!is.na(Hoursdad)) |> 
  arrange(Progress) |> 
  mutate(Progress = as.character(Progress))

if (FALSE) {
hist(data1$Age,prob=T,col='steelblue')
lines(density(data1$Age),lwd=2,col='darkred')

data1 |>
  ggplot(aes(x=Age))+
  geom_density()+
  theme_bw()
}
```

## Level 2: Data Prep. and inspection

### Create a variable called ‘PsyContF’ by summing together the following variables: DyadF1+ DyadF2+ DyadF3+ DyadF4+ DyadF5+ DyadF6+ DyadF7

```{r}
dyad_df = data1 |> 
  mutate(Progress = as.character(Progress)) |> 
  select(Progress,DyadF1, DyadF2,DyadF3, DyadF4, DyadF5,DyadF6, DyadF7) |> 
  sum_cols() |> 
  rename(PsyContF = Total) |> 
  select(PsyContF,P1=Progress)

data2 = data1 |> 
  cbind(dyad_df) |> 
  select(PsyContF,everything())

sum(is.na(data2$PsyContF))
```

### Test your skills: Visualize outlier values for ‘PsyContF’.

```{r}
boxplot(data2$PsyContF,main='PsyContF')

data2 |> 
  ggplot(aes(x=PsyContF))+
  geom_boxplot()

summary(data2$PsyContF)

#hist(data2$PsyContF)
```

### Test your skills: If outliers are present, remove them.

```{r}
data3 = data2 |> 
  filter(PsyContF < 25)

boxplot(data3$PsyContF)
```

### Create a variable called FACEcomm (Family Communication) by adding together the following items FACES43 + FACES44 + FACES45 + FACES46 + FACES47 + FACES48 + FACES49 + FACES50 + FACES51 + FACES52.

```{r}
face_df = data3 |> 
  select(FACES43, FACES44, FACES45, FACES46 ,FACES47 , FACES48 , FACES49 ,FACES50, FACES51 ,FACES52,Progress) |> 
  sum_cols() |> 
  rename(FACEcomm = Total) |> 
  select(FACEcomm,P2=Progress)

data4 = data3 |> 
  #left_join(face_df,by=join_by(Progress==P2)) |> 
  cbind(face_df) |> 
  select(FACEcomm,PsyContF,P1,P2,everything()) |> 
  filter(FACEcomm >= 22)

sum(is.na(data4$FACEcomm))

summary(data4$FACEcomm)

boxplot(data4$FACEcomm)
```

Test your skills: Visualize the distribution of the FACEcomm

```{r}
hist(data4$FACEcomm,probability = T,col='steelblue',main='Distro of Family Communication',xlab = 'FACEcomm')
lines(density(data4$FACEcomm),lwd=2,col='red')
```

## Level 3: Descriptives

### Find the mean, standard deviation, median, and range of ‘PsyContF’ and ‘FACEcomm’.

### Test your skills: Find the mean, standard deviation, median, and range of ‘PsyContF’ and ‘FACEcomm’ in one step

```{r}
level3_list = c('FACEcomm','PsyContF')

for (i in level3_list){
  x=desc_stats(data4[[i]])
  print(paste("Descriptives for",i))
  print(x)
}

for (i in level3_list){
  x=range(data4[[i]])
  print(paste('Range for',i))
  print(x)
}

```

## Level 4: Inferential + Other Statistics

### Is there a correlation between ‘PsyContF’ and ‘FACEcomm?’ Note any relevant statistics.

#### Normality test and plot

```{r}
for (i in level3_list){
  x=shapiro.test(data4[[i]])
  print(i)
  print(x)
}

for (i in level3_list){
  hist(data4[[i]],main=i,xlab=i,probability = T,col='steelblue')
  lines(density(data4[[i]]),lwd=2,col='red')
}

#not normally distributed
```

#### Correlation

```{r}
cor.test(data4$PsyContF,data4$FACEcomm,method = 'k')

if (FALSE) {
method_list = list('k','s','p')

for (i in method_list){
  x=cor.test(data4$PsyContF,data4$FACEcomm, method = i)
  print(x)
}
}

#according to Kendall's tau, there is a statistically significant weak negative correlation of -0.17 between FACEcomm and PsyContF.
```

#### Plot

```{r}
data4 |> 
  ggplot(aes(x=PsyContF,y=FACEcomm))+
  geom_point()+
  theme_bw()+
  geom_smooth(method = 'lm')+
  labs(title='What in the Family Communication and PsyContF is Happening Here?')+
  theme(plot.title = element_text(hjust = .5))
```


```{r include=F}
#beep when done
if (require("beepr"))
  beepr::beep(2)
```