LOAD LIBRARIES

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DHARMa)
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
library(performance)
## Warning: package 'performance' was built under R version 4.5.2
library(nlme)
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'lme4'
## 
## The following object is masked from 'package:nlme':
## 
##     lmList
library(mgcv)         
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(gamlss)         
## Loading required package: splines
## Loading required package: gamlss.data
## 
## Attaching package: 'gamlss.data'
## 
## The following object is masked from 'package:datasets':
## 
##     sleep
## 
## Loading required package: gamlss.dist
## Loading required package: parallel
##  **********   GAMLSS Version 5.5-0  ********** 
## For more on GAMLSS look at https://www.gamlss.com/
## Type gamlssNews() to see new features/changes/bug fixes.
## 
## 
## Attaching package: 'gamlss'
## 
## The following object is masked from 'package:mgcv':
## 
##     lp
## 
## The following object is masked from 'package:lme4':
## 
##     refit
## 
## The following object is masked from 'package:DHARMa':
## 
##     getQuantile
library(quantreg)     
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:Matrix':
## 
##     det
library(corrplot)   
## corrplot 0.95 loaded

FIRST LOOK

# Read Autism.csv with base read.csv() and wrap the result in a tibble
df <- as_tibble(read.csv("Autism.csv"))
dim(df)  # number of rows, then number of columns
## [1] 704  21
# Column names exactly as they appear in the file; note the source spellings
# "jundice", "austim" and "contry_of_res", which must be typed as-is below.
colnames(df)
##  [1] "A1_Score"        "A2_Score"        "A3_Score"        "A4_Score"       
##  [5] "A5_Score"        "A6_Score"        "A7_Score"        "A8_Score"       
##  [9] "A9_Score"        "A10_Score"       "age"             "gender"         
## [13] "ethnicity"       "jundice"         "austim"          "contry_of_res"  
## [17] "used_app_before" "result"          "age_desc"        "relation"       
## [21] "Class.ASD"
head(df, n = 6L)  # preview the first six rows
## # A tibble: 6 × 21
##   A1_Score A2_Score A3_Score A4_Score A5_Score A6_Score A7_Score A8_Score
##      <int>    <int>    <int>    <int>    <int>    <int>    <int>    <int>
## 1        1        1        1        1        0        0        1        1
## 2        1        1        0        1        0        0        0        1
## 3        1        1        0        1        1        0        1        1
## 4        1        1        0        1        0        0        1        1
## 5        1        0        0        0        0        0        0        1
## 6        1        1        1        1        1        0        1        1
## # ℹ 13 more variables: A9_Score <int>, A10_Score <int>, age <chr>,
## #   gender <chr>, ethnicity <chr>, jundice <chr>, austim <chr>,
## #   contry_of_res <chr>, used_app_before <chr>, result <int>, age_desc <chr>,
## #   relation <chr>, Class.ASD <chr>
# Check how each column was parsed. In the output below the ten A*_Score items
# and result are int, while age is chr even though its values look numeric
# (presumably because of non-numeric placeholders such as "?" - handled in the
# cleaning step).
str(df)       # data types - VERY important
## tibble [704 × 21] (S3: tbl_df/tbl/data.frame)
##  $ A1_Score       : int [1:704] 1 1 1 1 1 1 0 1 1 1 ...
##  $ A2_Score       : int [1:704] 1 1 1 1 0 1 1 1 1 1 ...
##  $ A3_Score       : int [1:704] 1 0 0 0 0 1 0 1 0 1 ...
##  $ A4_Score       : int [1:704] 1 1 1 1 0 1 0 1 0 1 ...
##  $ A5_Score       : int [1:704] 0 0 1 0 0 1 0 0 1 0 ...
##  $ A6_Score       : int [1:704] 0 0 0 0 0 0 0 0 0 1 ...
##  $ A7_Score       : int [1:704] 1 0 1 1 0 1 0 0 0 1 ...
##  $ A8_Score       : int [1:704] 1 1 1 1 1 1 1 0 1 1 ...
##  $ A9_Score       : int [1:704] 0 0 1 0 0 1 0 1 1 1 ...
##  $ A10_Score      : int [1:704] 0 1 1 1 0 1 0 0 1 0 ...
##  $ age            : chr [1:704] "26" "24" "27" "35" ...
##  $ gender         : chr [1:704] "f" "m" "m" "f" ...
##  $ ethnicity      : chr [1:704] "White-European" "Latino" "Latino" "White-European" ...
##  $ jundice        : chr [1:704] "no" "no" "yes" "no" ...
##  $ austim         : chr [1:704] "no" "yes" "yes" "yes" ...
##  $ contry_of_res  : chr [1:704] "'United States'" "Brazil" "Spain" "'United States'" ...
##  $ used_app_before: chr [1:704] "no" "no" "no" "no" ...
##  $ result         : int [1:704] 6 5 8 6 2 9 2 5 6 8 ...
##  $ age_desc       : chr [1:704] "'18 and more'" "'18 and more'" "'18 and more'" "'18 and more'" ...
##  $ relation       : chr [1:704] "Self" "Self" "Parent" "Self" ...
##  $ Class.ASD      : chr [1:704] "NO" "NO" "YES" "NO" ...

CLEAN DATA

# The file uses the literal string "?" for missing entries; turn those into
# real NA values. Only character columns can hold "?", so only they are touched.
df <- df %>%
  mutate(across(where(is.character), ~ na_if(.x, "?")))
df %>%
  is.na() %>%
  colSums()  # number of NAs in each column
##        A1_Score        A2_Score        A3_Score        A4_Score        A5_Score 
##               0               0               0               0               0 
##        A6_Score        A7_Score        A8_Score        A9_Score       A10_Score 
##               0               0               0               0               0 
##             age          gender       ethnicity         jundice          austim 
##               2               0              95               0               0 
##   contry_of_res used_app_before          result        age_desc        relation 
##               0               0               0               0              95 
##       Class.ASD 
##               0
100 * colMeans(is.na(df))  # share of missing values per column, in percent
##        A1_Score        A2_Score        A3_Score        A4_Score        A5_Score 
##       0.0000000       0.0000000       0.0000000       0.0000000       0.0000000 
##        A6_Score        A7_Score        A8_Score        A9_Score       A10_Score 
##       0.0000000       0.0000000       0.0000000       0.0000000       0.0000000 
##             age          gender       ethnicity         jundice          austim 
##       0.2840909       0.0000000      13.4943182       0.0000000       0.0000000 
##   contry_of_res used_app_before          result        age_desc        relation 
##       0.0000000       0.0000000       0.0000000       0.0000000      13.4943182 
##       Class.ASD 
##       0.0000000
# Complete-case cleaning in one pipeline:
#   1. drop every row that has an NA in any column,
#   2. add `asd`, a numeric 0/1 copy of the outcome,
#   3. coerce each column to the type used for modelling.
# NOTE(review): factor levels keep the quoting/spacing found in the file
# (e.g. "'Middle Eastern '"), and contry_of_res / age_desc are left as chr.
df <- df %>%
  drop_na() %>%
  mutate(
    # 1 when Class.ASD is "YES", otherwise 0; appended as the last column
    asd = ifelse(Class.ASD == "YES", 1, 0),
    across(A1_Score:A10_Score, as.integer),
    across(c(age, result), as.numeric),
    across(
      c(gender, ethnicity, jundice, austim, used_app_before, relation),
      factor
    ),
    # Explicit level order so that "NO" is the reference level
    Class.ASD = factor(Class.ASD, levels = c("NO", "YES"))
  )

# Re-inspect the structure after cleaning to confirm the row count and the
# column types produced by the conversions above.
str(df)
## tibble [609 × 22] (S3: tbl_df/tbl/data.frame)
##  $ A1_Score       : int [1:609] 1 1 1 1 1 0 1 1 1 1 ...
##  $ A2_Score       : int [1:609] 1 1 1 1 1 1 1 1 1 1 ...
##  $ A3_Score       : int [1:609] 1 0 0 0 1 0 1 0 1 1 ...
##  $ A4_Score       : int [1:609] 1 1 1 1 1 0 1 0 1 1 ...
##  $ A5_Score       : int [1:609] 0 0 1 0 1 0 0 1 0 1 ...
##  $ A6_Score       : int [1:609] 0 0 0 0 0 0 0 0 1 1 ...
##  $ A7_Score       : int [1:609] 1 0 1 1 1 0 0 0 1 1 ...
##  $ A8_Score       : int [1:609] 1 1 1 1 1 1 0 1 1 1 ...
##  $ A9_Score       : int [1:609] 0 0 1 0 1 0 1 1 1 1 ...
##  $ A10_Score      : int [1:609] 0 1 1 1 1 0 0 1 0 1 ...
##  $ age            : num [1:609] 26 24 27 35 36 17 64 29 17 33 ...
##  $ gender         : Factor w/ 2 levels "f","m": 1 2 2 1 2 1 2 2 2 2 ...
##  $ ethnicity      : Factor w/ 11 levels "'Middle Eastern '",..: 11 6 6 11 8 4 11 11 3 11 ...
##  $ jundice        : Factor w/ 2 levels "no","yes": 1 1 2 1 2 1 1 1 2 1 ...
##  $ austim         : Factor w/ 2 levels "no","yes": 1 2 2 2 1 1 1 1 2 1 ...
##  $ contry_of_res  : chr [1:609] "'United States'" "Brazil" "Spain" "'United States'" ...
##  $ used_app_before: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ result         : num [1:609] 6 5 8 6 9 2 5 6 8 10 ...
##  $ age_desc       : chr [1:609] "'18 and more'" "'18 and more'" "'18 and more'" "'18 and more'" ...
##  $ relation       : Factor w/ 5 levels "'Health care professional'",..: 5 5 3 5 5 5 3 5 1 4 ...
##  $ Class.ASD      : Factor w/ 2 levels "NO","YES": 1 1 2 1 2 1 1 1 2 2 ...
##  $ asd            : num [1:609] 0 0 1 0 1 0 0 0 1 1 ...