set environment
# Set rendering parameters
# Hide package start-up messages and warnings in every knitted chunk.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
# Load packages
# NOTE(review): library(tidyverse) attaches dplyr as well, so the separate
# library(dplyr) call looks redundant -- harmless, kept as written.
library(dplyr)
library(tidyverse)
# Root folder of the course repository on this machine. The path is absolute
# and user-specific, so it has to be edited before running elsewhere.
gitHubPath <- file.path("/Users", "hanalockwood", "Downloads", "MasterStatsUsingR-main")
# A bare parenthesised string literal: it only prints itself and is not
# assigned to anything; the files below are located through gitHubPath instead.
("~/Downloads/MasterStatsUsingR-main/data/restingEEG.csv")
## [1] "~/Downloads/MasterStatsUsingR-main/data/restingEEG.csv"
Load in data and repeat some merging from problem set 3
#real data from a large project to study people with autism.
#more information about these data can be found here:
# https://medicine.yale.edu/ycci/clinicaltrials/categories/children/autism-biomarkers-consortium/
#eegDat includes data from other studies as well. You can read more about
#these eeg data here:
#https://molecularautism.biomedcentral.com/articles/10.1186/s13229-025-00647-3
# Resting-state EEG summary data, read from the course repository's data
# folder with readr::read_csv().
eegDat <- read_csv(
  file.path(gitHubPath, "data", "restingEEG.csv")
)
#load in behavioral data about participants:
# Read with base read.csv() from the Downloads folder (not from gitHubPath).
behDat <- read.csv(
  file.path("/Users", "hanalockwood", "Downloads", "vinelandSummary.csv")
)
#/Users/hanalockwood/Downloads
#These data are drawn from the Vineland-3 Adaptive Behavior Scale
#1) filter down to only eegDat from the Autism biomarkers consortium
# for clinical trials dataSet.
# This dataSet is referred to by the short name: biomarkCon
eegDat <- eegDat %>% filter(dataSet == "biomarkCon")
#2) Merge the eegDat and behDat dataframes into a single allDat dataframe.
# merge() without `by` joins on the column names the two data frames share.
# eegDat and behDat share no column name, so the bare merge(eegDat, behDat)
# paired every EEG row with every behavioral row (a Cartesian product).
# Join on the participant identifier explicitly instead.
# NOTE(review): assumes eegDat$key and behDat$subjectkey hold the same
# participant identifier -- confirm against the data dictionary.
stopifnot(
  "key" %in% names(eegDat),
  "subjectkey" %in% names(behDat)
)
allDat <- merge(eegDat, behDat, by.x = "key", by.y = "subjectkey")
#3) Transform the group variable in allDat so that rather than having AD, ASD,
# and CON participants, there are only AD and CON participants. Make all ASD
# participants be AD instead.
allDat$group[allDat$group == "ASD"] <- "AD"
#4) Write code that verifies whether or not there is exactly one row per
# participant in the data frame
# allDat has no participantID column, so allDat$participantID was NULL and the
# comparison could never be TRUE for a non-empty data frame. Compare against
# the join key, which is the participant identifier after the merge.
nrow(allDat) == length(unique(allDat$key))
## [1] FALSE
#Console showed [1] FALSE. This means there are duplicates.
# Now we need to get rid of the existing duplicates:
# Print every column name of the merged data frame
# (presumably to look for a participant identifier column to de-duplicate on).
colnames(allDat)
## [1] "...1" "name"
## [3] "folder" "date"
## [5] "bytes" "isdir"
## [7] "datenum" "dataSet"
## [9] "sex" "age"
## [11] "eyes" "group"
## [13] "key" "origfile"
## [15] "nbTrialOrig" "nbTrialFinal"
## [17] "nbChanOrig" "nbChanFinal"
## [19] "IQ" "IQ_measure"
## [21] "pow_delta_rF" "pow_theta_rF"
## [23] "pow_alpha_rF" "pow_beta_rF"
## [25] "pow_gam1_rF" "pow_gam2_rF"
## [27] "pow_delta_lF" "pow_theta_lF"
## [29] "pow_alpha_lF" "pow_beta_lF"
## [31] "pow_gam1_lF" "pow_gam2_lF"
## [33] "pow_delta_rCP" "pow_theta_rCP"
## [35] "pow_alpha_rCP" "pow_beta_rCP"
## [37] "pow_gam1_rCP" "pow_gam2_rCP"
## [39] "pow_delta_lCP" "pow_theta_lCP"
## [41] "pow_alpha_lCP" "pow_beta_lCP"
## [43] "pow_gam1_lCP" "pow_gam2_lCP"
## [45] "pow_delta_rOP" "pow_theta_rOP"
## [47] "pow_alpha_rOP" "pow_beta_rOP"
## [49] "pow_gam1_rOP" "pow_gam2_rOP"
## [51] "pow_delta_lOP" "pow_theta_lOP"
## [53] "pow_alpha_lOP" "pow_beta_lOP"
## [55] "pow_gam1_lOP" "pow_gam2_lOP"
## [57] "pow_delta_F" "pow_theta_F"
## [59] "pow_alpha_F" "pow_beta_F"
## [61] "pow_gam1_F" "pow_gam2_F"
## [63] "pow_delta_O" "pow_theta_O"
## [65] "pow_alpha_O" "pow_beta_O"
## [67] "pow_gam1_O" "pow_gam2_O"
## [69] "pow_delta_C" "pow_theta_C"
## [71] "pow_alpha_C" "pow_beta_C"
## [73] "pow_gam1_C" "pow_gam2_C"
## [75] "pow_delta_lL" "pow_theta_lL"
## [77] "pow_alpha_lL" "pow_beta_lL"
## [79] "pow_gam1_lL" "pow_gam2_lL"
## [81] "pow_delta_rL" "pow_theta_rL"
## [83] "pow_alpha_rL" "pow_beta_rL"
## [85] "pow_gam1_rL" "pow_gam2_rL"
## [87] "pow_delta_rH" "pow_theta_rH"
## [89] "pow_alpha_rH" "pow_beta_rH"
## [91] "pow_gam1_rH" "pow_gam2_rH"
## [93] "pow_delta_lH" "pow_theta_lH"
## [95] "pow_alpha_lH" "pow_beta_lH"
## [97] "pow_gam1_lH" "pow_gam2_lH"
## [99] "X" "subjectkey"
## [101] "collection_id" "vinland301_id"
## [103] "dataset_id" "interview_age"
## [105] "vi3_rec_r27" "vi3_rec_r28"
## [107] "vi3_rec_r29" "vi3_rec_r30"
## [109] "receptive_right_left" "receptive_16"
## [111] "vi3_rec_r33" "vi3_rec_r34"
## [113] "vi3_rec_r35" "vi3_rec_r36"
## [115] "vi3_rec_r37" "vi3_rec_r38"
## [117] "vi3_rec_r39" "expressive_37"
## [119] "vi3_exp_r41" "vi3_exp_r42"
## [121] "expressive_know_opinions" "expressive_42"
## [123] "vi3_exp_r45" "expressive_40"
## [125] "vi3_exp_r47" "expressive_51"
## [127] "vi3_exp_r49" "written_12"
## [129] "written_13" "written_14"
## [131] "vi3_wrn_r21" "vi3_wrn_r22"
## [133] "vi3_wrn_r23" "vi3_wrn_r24"
## [135] "vi3_wrn_r25" "vi3_wrn_r26"
## [137] "written_17" "vi3_wrn_r28"
## [139] "vi3_wrn_r29" "vi3_wrn_r30"
## [141] "vi3_wrn_r31" "written_21"
## [143] "vi3_wrn_r33" "personal_28"
## [145] "vi3_per_r40" "personal_33"
## [147] "personal_30" "personal_31"
## [149] "vi3_per_r44" "vi3_per_r45"
## [151] "vi3_per_r46" "vi3_per_r47"
## [153] "vi3_per_r48" "vi3_per_r49"
## [155] "vi3_per_r50" "vi3_per_r51"
## [157] "personal_37" "personal_36"
## [159] "domestic_1" "domestic_6"
## [161] "vi3_dom_r03" "vi3_dom_r04"
## [163] "vi3_dom_r05" "domestic_5"
## [165] "vi3_dom_r07" "vi3_dom_r08"
## [167] "domestic_2" "vi3_dom_r10"
## [169] "vi3_dom_r11" "vi3_dom_r12"
## [171] "domestic_11" "vi3_dom_r14"
## [173] "vi3_dom_r15" "vi3_dom_r16"
## [175] "vi3_dom_r17" "vi3_dom_r18"
## [177] "vi3_dom_r19" "domestic_16"
## [179] "community_19" "vi3_cmm_r21"
## [181] "community_21" "community_18"
## [183] "copingskills_22" "vi3_cmm_r25"
## [185] "vi3_cmm_r26" "vi3_cmm_r27"
## [187] "community_29" "vi3_cmm_r29"
## [189] "vi3_cmm_r30" "vi3_cmm_r31"
## [191] "vi3_cmm_r32" "vi3_cmm_r33"
## [193] "community_26" "vi3_cmm_r35"
## [195] "vi3_cmm_r36" "vi3_cmm_r37"
## [197] "vi3_cmm_r38" "vi3_ipr_r24"
## [199] "vi3_ipr_r25" "interpersrelation_26"
## [201] "vi3_ipr_r27" "interpersrelation_27"
## [203] "vi3_ipr_r29" "vi3_ipr_r30"
## [205] "interpersrelation_28" "vi3_ipr_r32"
## [207] "vi3_ipr_r33" "copingskills_15"
## [209] "vi3_ipr_r35" "vi3_ipr_r36"
## [211] "vi3_ipr_r37" "vi3_ipr_r38"
## [213] "vi3_ipr_r39" "interpersrelation_36"
## [215] "vi3_ipr_r41" "interpersrelation_35"
## [217] "vi3_ipr_r43" "vi3_pla_r21"
## [219] "vi3_pla_r22" "vine_soc_gamerls"
## [221] "playleisuretime_20" "vi3_pla_r25"
## [223] "playleisuretime_22" "vi3_pla_r27"
## [225] "vi3_pla_r28" "playleisuretime_23"
## [227] "vi3_pla_r30" "vi3_pla_r31"
## [229] "vi3_pla_r32" "vi3_pla_r33"
## [231] "playleisuretime_29" "playleisuretime_30"
## [233] "vi3_cop_r08" "vi3_cop_r09"
## [235] "vi3_cop_r10" "copingskills_11"
## [237] "vi3_cop_r12" "copingskills_3"
## [239] "copingskills_16" "copingskills_13"
## [241] "copingskills_action" "vi3_cop_r17"
## [243] "copingskills_17" "vi3_cop_r19"
## [245] "vi3_cop_r20" "vi3_cop_r21"
## [247] "copingskills_23" "vi3_cop_r23"
## [249] "copingskills_26" "vi3_cop_r25"
## [251] "copingskills_30" "copingskills_25"
## [253] "vi3_cop_r28" "copingskills_28"
## [255] "vi3_cop_r30" "copingskills_29"
## [257] "receptivesubdomain_1" "expressivesubdomain_2"
## [259] "writtensubdomain_3" "personalsubdomain_1"
## [261] "domesticsubdomain_2" "communitysubdomain_3"
## [263] "interpersrelationsubdom_1" "playleisuretimesubdomain_2"
## [265] "copingskillssubdomain_3" "grosssubdomain_1"
## [267] "finesubdomain_2" "mbiinternalizingsubdomain_1"
## [269] "mbiexternalizingsubdomain_2" "vi3_rec_est"
## [271] "vi3_exp_est" "vi3_wrn_est"
## [273] "vi3_per_est" "vi3_dom_est"
## [275] "vi3_cmm_est" "vi3_ipr_est"
## [277] "vi3_pla_est" "vi3_cop_est"
## [279] "communicationdomain_total" "livingskillsdomain_total"
## [281] "socializationdomain_total" "receptive_vscore"
## [283] "expressive_vscore" "written_vscore"
## [285] "personal_vscore" "domestic_vscore"
## [287] "community_vscore" "interprltn_vscore"
## [289] "playleis_vscore" "copingskill_vscore"
## [291] "communicationdomain_totalb" "dailylivsk_stnd_score"
## [293] "socializationdomain_totalb" "composite_totalb"
## [295] "communicationdomain_totald" "dailylivsk_conf_interv"
## [297] "socializationdomain_totald" "composite_totald"
## [299] "vi3_com_cil" "vi3_dls_cil"
## [301] "vi3_soc_cil" "vi3_abc_cil"
## [303] "vi3_com_cih" "vi3_dls_cih"
## [305] "vi3_soc_cih" "vi3_abc_cih"
## [307] "vi3_rec_gsv" "vi3_exp_gsv"
## [309] "vi3_wrn_gsv" "vi3_per_gsv"
## [311] "vi3_dom_gsv" "vi3_cmm_gsv"
## [313] "vi3_ipr_gsv" "vi3_pla_gsv"
## [315] "vi3_cop_gsv"
# allDat has no participantID column, so allDat$participantID is NULL:
# duplicated(NULL) is an empty logical vector, the row index selects nothing,
# and allDat_clean ended up with zero rows -- which made the check below
# trivially TRUE (0 == 0). Keep the first row per participant key instead.
# NOTE(review): assumes `key` is the participant identifier -- confirm.
stopifnot("key" %in% names(allDat))
allDat_clean <- allDat[!duplicated(allDat$key), ]
#Check:
nrow(allDat_clean) == length(unique(allDat_clean$key))
## [1] TRUE
#5) how many variables are available in the allDat dataframe? 315 variables
# colnames() prints every column name; the index of the last name in the
# printed output is the number of variables. ncol(allDat) would return that
# count directly.
colnames(allDat)
## [1] "...1" "name"
## [3] "folder" "date"
## [5] "bytes" "isdir"
## [7] "datenum" "dataSet"
## [9] "sex" "age"
## [11] "eyes" "group"
## [13] "key" "origfile"
## [15] "nbTrialOrig" "nbTrialFinal"
## [17] "nbChanOrig" "nbChanFinal"
## [19] "IQ" "IQ_measure"
## [21] "pow_delta_rF" "pow_theta_rF"
## [23] "pow_alpha_rF" "pow_beta_rF"
## [25] "pow_gam1_rF" "pow_gam2_rF"
## [27] "pow_delta_lF" "pow_theta_lF"
## [29] "pow_alpha_lF" "pow_beta_lF"
## [31] "pow_gam1_lF" "pow_gam2_lF"
## [33] "pow_delta_rCP" "pow_theta_rCP"
## [35] "pow_alpha_rCP" "pow_beta_rCP"
## [37] "pow_gam1_rCP" "pow_gam2_rCP"
## [39] "pow_delta_lCP" "pow_theta_lCP"
## [41] "pow_alpha_lCP" "pow_beta_lCP"
## [43] "pow_gam1_lCP" "pow_gam2_lCP"
## [45] "pow_delta_rOP" "pow_theta_rOP"
## [47] "pow_alpha_rOP" "pow_beta_rOP"
## [49] "pow_gam1_rOP" "pow_gam2_rOP"
## [51] "pow_delta_lOP" "pow_theta_lOP"
## [53] "pow_alpha_lOP" "pow_beta_lOP"
## [55] "pow_gam1_lOP" "pow_gam2_lOP"
## [57] "pow_delta_F" "pow_theta_F"
## [59] "pow_alpha_F" "pow_beta_F"
## [61] "pow_gam1_F" "pow_gam2_F"
## [63] "pow_delta_O" "pow_theta_O"
## [65] "pow_alpha_O" "pow_beta_O"
## [67] "pow_gam1_O" "pow_gam2_O"
## [69] "pow_delta_C" "pow_theta_C"
## [71] "pow_alpha_C" "pow_beta_C"
## [73] "pow_gam1_C" "pow_gam2_C"
## [75] "pow_delta_lL" "pow_theta_lL"
## [77] "pow_alpha_lL" "pow_beta_lL"
## [79] "pow_gam1_lL" "pow_gam2_lL"
## [81] "pow_delta_rL" "pow_theta_rL"
## [83] "pow_alpha_rL" "pow_beta_rL"
## [85] "pow_gam1_rL" "pow_gam2_rL"
## [87] "pow_delta_rH" "pow_theta_rH"
## [89] "pow_alpha_rH" "pow_beta_rH"
## [91] "pow_gam1_rH" "pow_gam2_rH"
## [93] "pow_delta_lH" "pow_theta_lH"
## [95] "pow_alpha_lH" "pow_beta_lH"
## [97] "pow_gam1_lH" "pow_gam2_lH"
## [99] "X" "subjectkey"
## [101] "collection_id" "vinland301_id"
## [103] "dataset_id" "interview_age"
## [105] "vi3_rec_r27" "vi3_rec_r28"
## [107] "vi3_rec_r29" "vi3_rec_r30"
## [109] "receptive_right_left" "receptive_16"
## [111] "vi3_rec_r33" "vi3_rec_r34"
## [113] "vi3_rec_r35" "vi3_rec_r36"
## [115] "vi3_rec_r37" "vi3_rec_r38"
## [117] "vi3_rec_r39" "expressive_37"
## [119] "vi3_exp_r41" "vi3_exp_r42"
## [121] "expressive_know_opinions" "expressive_42"
## [123] "vi3_exp_r45" "expressive_40"
## [125] "vi3_exp_r47" "expressive_51"
## [127] "vi3_exp_r49" "written_12"
## [129] "written_13" "written_14"
## [131] "vi3_wrn_r21" "vi3_wrn_r22"
## [133] "vi3_wrn_r23" "vi3_wrn_r24"
## [135] "vi3_wrn_r25" "vi3_wrn_r26"
## [137] "written_17" "vi3_wrn_r28"
## [139] "vi3_wrn_r29" "vi3_wrn_r30"
## [141] "vi3_wrn_r31" "written_21"
## [143] "vi3_wrn_r33" "personal_28"
## [145] "vi3_per_r40" "personal_33"
## [147] "personal_30" "personal_31"
## [149] "vi3_per_r44" "vi3_per_r45"
## [151] "vi3_per_r46" "vi3_per_r47"
## [153] "vi3_per_r48" "vi3_per_r49"
## [155] "vi3_per_r50" "vi3_per_r51"
## [157] "personal_37" "personal_36"
## [159] "domestic_1" "domestic_6"
## [161] "vi3_dom_r03" "vi3_dom_r04"
## [163] "vi3_dom_r05" "domestic_5"
## [165] "vi3_dom_r07" "vi3_dom_r08"
## [167] "domestic_2" "vi3_dom_r10"
## [169] "vi3_dom_r11" "vi3_dom_r12"
## [171] "domestic_11" "vi3_dom_r14"
## [173] "vi3_dom_r15" "vi3_dom_r16"
## [175] "vi3_dom_r17" "vi3_dom_r18"
## [177] "vi3_dom_r19" "domestic_16"
## [179] "community_19" "vi3_cmm_r21"
## [181] "community_21" "community_18"
## [183] "copingskills_22" "vi3_cmm_r25"
## [185] "vi3_cmm_r26" "vi3_cmm_r27"
## [187] "community_29" "vi3_cmm_r29"
## [189] "vi3_cmm_r30" "vi3_cmm_r31"
## [191] "vi3_cmm_r32" "vi3_cmm_r33"
## [193] "community_26" "vi3_cmm_r35"
## [195] "vi3_cmm_r36" "vi3_cmm_r37"
## [197] "vi3_cmm_r38" "vi3_ipr_r24"
## [199] "vi3_ipr_r25" "interpersrelation_26"
## [201] "vi3_ipr_r27" "interpersrelation_27"
## [203] "vi3_ipr_r29" "vi3_ipr_r30"
## [205] "interpersrelation_28" "vi3_ipr_r32"
## [207] "vi3_ipr_r33" "copingskills_15"
## [209] "vi3_ipr_r35" "vi3_ipr_r36"
## [211] "vi3_ipr_r37" "vi3_ipr_r38"
## [213] "vi3_ipr_r39" "interpersrelation_36"
## [215] "vi3_ipr_r41" "interpersrelation_35"
## [217] "vi3_ipr_r43" "vi3_pla_r21"
## [219] "vi3_pla_r22" "vine_soc_gamerls"
## [221] "playleisuretime_20" "vi3_pla_r25"
## [223] "playleisuretime_22" "vi3_pla_r27"
## [225] "vi3_pla_r28" "playleisuretime_23"
## [227] "vi3_pla_r30" "vi3_pla_r31"
## [229] "vi3_pla_r32" "vi3_pla_r33"
## [231] "playleisuretime_29" "playleisuretime_30"
## [233] "vi3_cop_r08" "vi3_cop_r09"
## [235] "vi3_cop_r10" "copingskills_11"
## [237] "vi3_cop_r12" "copingskills_3"
## [239] "copingskills_16" "copingskills_13"
## [241] "copingskills_action" "vi3_cop_r17"
## [243] "copingskills_17" "vi3_cop_r19"
## [245] "vi3_cop_r20" "vi3_cop_r21"
## [247] "copingskills_23" "vi3_cop_r23"
## [249] "copingskills_26" "vi3_cop_r25"
## [251] "copingskills_30" "copingskills_25"
## [253] "vi3_cop_r28" "copingskills_28"
## [255] "vi3_cop_r30" "copingskills_29"
## [257] "receptivesubdomain_1" "expressivesubdomain_2"
## [259] "writtensubdomain_3" "personalsubdomain_1"
## [261] "domesticsubdomain_2" "communitysubdomain_3"
## [263] "interpersrelationsubdom_1" "playleisuretimesubdomain_2"
## [265] "copingskillssubdomain_3" "grosssubdomain_1"
## [267] "finesubdomain_2" "mbiinternalizingsubdomain_1"
## [269] "mbiexternalizingsubdomain_2" "vi3_rec_est"
## [271] "vi3_exp_est" "vi3_wrn_est"
## [273] "vi3_per_est" "vi3_dom_est"
## [275] "vi3_cmm_est" "vi3_ipr_est"
## [277] "vi3_pla_est" "vi3_cop_est"
## [279] "communicationdomain_total" "livingskillsdomain_total"
## [281] "socializationdomain_total" "receptive_vscore"
## [283] "expressive_vscore" "written_vscore"
## [285] "personal_vscore" "domestic_vscore"
## [287] "community_vscore" "interprltn_vscore"
## [289] "playleis_vscore" "copingskill_vscore"
## [291] "communicationdomain_totalb" "dailylivsk_stnd_score"
## [293] "socializationdomain_totalb" "composite_totalb"
## [295] "communicationdomain_totald" "dailylivsk_conf_interv"
## [297] "socializationdomain_totald" "composite_totald"
## [299] "vi3_com_cil" "vi3_dls_cil"
## [301] "vi3_soc_cil" "vi3_abc_cil"
## [303] "vi3_com_cih" "vi3_dls_cih"
## [305] "vi3_soc_cih" "vi3_abc_cih"
## [307] "vi3_rec_gsv" "vi3_exp_gsv"
## [309] "vi3_wrn_gsv" "vi3_per_gsv"
## [311] "vi3_dom_gsv" "vi3_cmm_gsv"
## [313] "vi3_ipr_gsv" "vi3_pla_gsv"
## [315] "vi3_cop_gsv"
Describe and guess distribution type
#6) Create a dataset with only the AD participants using the filter function.
# name the new dataframe you create AD_only
# The AD / CON labels live in `group` (ASD was recoded to AD on allDat in
# step 3). `dataSet` names the source study and was already restricted to
# "biomarkCon", so filtering eegDat for dataSet == "AD" always returned zero
# rows.
AD_only <- allDat %>% filter(group == "AD")
#7 Copy paste the code from CompareRealToAnalytic_complete.Rmd that calculates
# the summary descriptive statistics for a variable of interest. Adapt this code
# to calculate the descriptive statistics for the same Vineland survey item
# that you worked with in questions 10 through the end in problem set 3.
# call your summary dataframe sumDat.
# Hint: if the minimum of your variable is greater than 0 then your offset
# can be zero
# One-row summary of the Vineland coping GSV item: mean, median and SD on the
# raw scale, plus mean and SD of the log-transformed values (the parameters of
# a fitted log-normal curve). offSet is 0, per the hint that no shift is
# needed when the minimum is greater than 0.
# na.rm = TRUE matches the descriptive statistics computed for this same
# column in problem 9; without it a single missing score turns every
# statistic into NA.
sumDat <- behDat %>%
  summarize(
    setName = "vi3_cop_gsv",
    meanVal = mean(vi3_cop_gsv, na.rm = TRUE),
    medianVal = median(vi3_cop_gsv, na.rm = TRUE),
    sdVal = sd(vi3_cop_gsv, na.rm = TRUE),
    logMean = mean(log(vi3_cop_gsv), na.rm = TRUE),
    logSD = sd(log(vi3_cop_gsv), na.rm = TRUE),
    offSet = 0
  )
#8 copy paste the code defining the plotAgainstDists function from
# CompareRealToAnalytic_complete.Rmd. Use this function to plot your chosen
# variable using the sumDat summary data that you calculated in the last
# question against different distributions
# make the title informative including what subscore you are plotting and what
# group.
# Hint: what's in the set environment chunk of CompareRealToAnalytic_complete.Rmd
# that you will need to make plotAgainstDists work?
# Draw a density-scaled histogram of one column and overlay a fitted normal
# curve (red) and a fitted log-normal curve (dark green).
#
# Arguments:
#   behDat         data frame that contains the variable to plot
#   varOfInterest  unquoted column name to put on the x axis
#   sumVals        one-row summary with meanVal, sdVal, logMean and logSD
#   myTitle        plot title
#   binCount       number of histogram bins (default 50)
# Returns the ggplot object, so further layers or labels can be added with `+`.
#
# The earlier version had no braces, hard-coded vi3_cop_gsv and the global
# sumDat, and ignored varOfInterest, sumVals, myTitle and binCount.
plotAgainstDists <- function(behDat,
                             varOfInterest, sumVals,
                             myTitle, binCount = 50) {
  ggplot(behDat, aes(x = {{ varOfInterest }})) +
    geom_histogram(aes(y = after_stat(density)),
                   bins = binCount,
                   fill = "lightblue",
                   color = "black") +
    stat_function(fun = dnorm,
                  args = list(mean = sumVals$meanVal,
                              sd = sumVals$sdVal),
                  color = "red",
                  linewidth = 1) +
    stat_function(fun = dlnorm,
                  args = list(meanlog = sumVals$logMean,
                              sdlog = sumVals$logSD),
                  color = "darkgreen",
                  linewidth = 1) +
    labs(title = myTitle,
         y = "Density") +
    theme_minimal()
}

# Defining the function draws nothing; call it so the plot actually appears.
plotAgainstDists(behDat, vi3_cop_gsv, sumDat,
                 myTitle = "Distribution of Vineland Coping GSV (Full Sample)") +
  labs(x = "Vineland Coping GSV")
#9 Calculate all of the descriptive measures that you know how to calculate for
# this variable: mean, median, mode, sd, mean absolute deviation,
# interquartile range, q25, and q75. Put all of this information into a neat table
# 1 row by 8 values. Use the kableExtra package to make a nicely formatted
# table. (see describeRealData_complete.Rmd)
library("kableExtra")
# Most frequent value of a vector.
#
# Arguments:
#   x      vector of any atomic type
#   na.rm  drop missing values before counting (default TRUE, matching the
#          na.rm = TRUE used for the other descriptive statistics); with
#          na.rm = FALSE, NA is counted like any other value
# Returns a length-one vector of the same type as x. Ties go to the value that
# appears first in x; an empty (or all-NA) input returns NA.
Mode <- function(x, na.rm = TRUE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  # match() maps each element to its position in ux; tabulate() counts them.
  ux[which.max(tabulate(match(x, ux)))]
}
# Descriptive statistics for the Vineland coping GSV item, missing values
# excluded throughout.
x <- behDat$vi3_cop_gsv
mean_val <- mean(x, na.rm = TRUE)
median_val <- median(x, na.rm = TRUE)
# Mode() is handed the non-missing values so NA can never be reported as the
# most frequent value.
mode_val <- Mode(x[!is.na(x)])
sd_val <- sd(x, na.rm = TRUE)
# Mean absolute deviation, as the question asks. stats::mad() is the *median*
# absolute deviation, so it is not used here.
mad_val <- mean(abs(x - mean_val), na.rm = TRUE)
iqr_val <- IQR(x, na.rm = TRUE)
# unname() drops the "25%" / "75%" labels so they do not become a row name.
q25_val <- unname(quantile(x, 0.25, na.rm = TRUE))
q75_val <- unname(quantile(x, 0.75, na.rm = TRUE))
descTable <- data.frame(
  Mean = mean_val,
  Median = median_val,
  Mode = mode_val,
  SD = sd_val,
  MAD = mad_val,
  IQR = iqr_val,
  Q25 = q25_val,
  Q75 = q75_val
)
# Render the 1 x 8 table; building the data frame alone displays nothing.
descTable %>%
  knitr::kable(digits = 2,
               caption = "Descriptive statistics: Vineland coping GSV") %>%
  kableExtra::kable_styling(full_width = FALSE)
#10 Synthesizing across what you've seen in your results so far, what
# distribution do you think best describes these data? Why? The log normal curve, represented in my plot as the green curve seems to be the most accurate distribution because it accounts for the skew of the graph. The log normal curve represents this skew better than the normal curve would.
Assessing an EEG variable
#11) this time, let's work with allDat (not split by groups) and examine the
# pow_theta_F variable. There are outliers in this dataset. Can you filter
# out all participants with values greater than 250 on this variable?
# Keep only rows whose frontal theta power is at most 250; the filtered frame
# replaces allDat.
allDat <- filter(allDat, pow_theta_F <= 250)
#12) Can you create a sumDat dataframe for this variable
# in the same format as you did for 7? Note, as long as the minimum is greater
# than zero, the offset is zero.
# One-row summary of pow_theta_F: raw-scale mean, median and SD plus the mean
# and SD of the log-transformed values.
sumDat <- summarise(
  allDat,
  setName = "pow_theta_F",
  meanVal = mean(pow_theta_F),
  medianVal = median(pow_theta_F),
  sdVal = sd(pow_theta_F),
  logMean = mean(log(pow_theta_F)),
  logSD = sd(log(pow_theta_F)),
  offSet = 0
)
#13) plot pow_theta_F against analytic distributions similarly to how you
# did in problem 8. There is no need to copy paste the function a second time
# Draw a density-scaled histogram of one column and overlay a fitted normal
# curve (red) and a fitted log-normal curve (dark green).
#
# Arguments:
#   allDat         data frame that contains the variable to plot
#   varOfInterest  unquoted column name to put on the x axis
#   sumVals        one-row summary with meanVal, sdVal, logMean and logSD
#   myTitle        plot title (previously ignored in favour of a fixed title)
#   binCount       number of histogram bins (default 50)
# Returns the ggplot object, so further layers or labels can be added with `+`.
plotAgainstDists <- function(allDat,
                             varOfInterest,
                             sumVals,
                             myTitle,
                             binCount = 50) {
  ggplot(allDat, aes(x = {{ varOfInterest }})) +
    geom_histogram(aes(y = after_stat(density)),
                   bins = binCount,
                   fill = "lightblue",
                   color = "black") +
    stat_function(fun = dnorm,
                  args = list(mean = sumVals$meanVal,
                              sd = sumVals$sdVal),
                  color = "red",
                  linewidth = 1) +
    stat_function(fun = dlnorm,
                  args = list(meanlog = sumVals$logMean,
                              sdlog = sumVals$logSD),
                  color = "darkgreen",
                  linewidth = 1) +
    labs(title = myTitle,
         y = "Density") +
    theme_minimal()
}

# Defining the function draws nothing; call it so the plot actually appears.
plotAgainstDists(allDat, pow_theta_F, sumDat,
                 myTitle = "Distribution of Frontal Theta Power") +
  labs(x = "Frontal Theta Power")
#14) Create a nicely formatted table of all your descriptive statistics
# similar to what you did in problem 9
# Most frequent value of a vector.
#
# Arguments:
#   x      vector of any atomic type
#   na.rm  drop missing values before counting (default TRUE); with
#          na.rm = FALSE, NA is counted like any other value
# Returns a length-one vector of the same type as x. Ties go to the value that
# appears first in x; an empty (or all-NA) input returns NA.
Mode <- function(x, na.rm = TRUE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  ux <- unique(x)
  # match() maps each element to its position in ux; tabulate() counts them.
  ux[which.max(tabulate(match(x, ux)))]
}
# Descriptive statistics for frontal theta power.
x <- allDat$pow_theta_F
mean_val <- mean(x)
median_val <- median(x)
mode_val <- Mode(x)
sd_val <- sd(x)
# Mean absolute deviation, computed the same way as for the Vineland item.
# stats::mad() is the *median* absolute deviation (and scales it by 1.4826 by
# default), so it is not used here.
mad_val <- mean(abs(x - mean_val))
iqr_val <- IQR(x)
# quantile(x) with no `probs` returns all five default quantiles, which gave
# the table five rows with identical Q25 and Q75 columns. Request each
# quartile explicitly; unname() keeps "25%" from becoming a row name.
q25_val <- unname(quantile(x, 0.25))
q75_val <- unname(quantile(x, 0.75))
descTable <- data.frame(
  Mean = mean_val,
  Median = median_val,
  Mode = mode_val,
  SD = sd_val,
  MAD = mad_val,
  IQR = iqr_val,
  Q25 = q25_val,
  Q75 = q75_val
)
# Render the 1 x 8 table; building the data frame alone displays nothing.
descTable %>%
  knitr::kable(digits = 2,
               caption = "Descriptive statistics: frontal theta power") %>%
  kableExtra::kable_styling(full_width = FALSE)
#15) Synthesize what you've seen about the pow_theta_F variable. what
# distribution do you think best describes these data? Why? The log normal curve, represented by the green curve on the plot is the best distribution for describing the data because it is better at encapsulating the skewed shape. Specifically, it closely aligns with the peak near the lower values as well as the long upper tail.
A couple concept questions
#16) What does an analytic distribution represent? An analytic distribution is a mathematical curve (for example the normal or log-normal) that describes how likely each value is. In this problem set it is the smooth curve drawn over the histogram for comparison. Overall, it summarizes patterns in the data and shows whether or not the data follow a known statistical distribution.
#17) If multiple distributions all seem to fit a dataset pretty well, then
# what distribution would you tend to want to use? why? Whichever distribution best matches the underlying theory of the dataset and supports valid, interpretable statistical inference is the one you should choose. When several candidates still fit equally well, choosing the simplest model is important as well.
#18) Are parametric or non-parametric variability measures going to be better
# for a log-normally distributed variable? Why? Non-parametric variability measures are better for a log-normally distributed variable. This is because non-parametric measures are not distorted by the skew and are less sensitive to extreme values.