####WORKPLACE SETTING####
setwd("C:/Users/vitto/Desktop/Autocritica/Analisi dati")
library(magrittr)
library(dplyr)
##
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
library(tm) #Text mining package, used for removePunctuation()
## Warning: il pacchetto 'tm' è stato creato con R versione 4.4.2
## Caricamento del pacchetto richiesto: NLP
## Warning: il pacchetto 'NLP' è stato creato con R versione 4.4.2
####RAW DATA INPUT####
raw1 <- read.csv("Raw Data/Somministrazione_1.csv")
raw2 <- read.csv("Raw Data/Somministrazione_2.csv")
raw3 <- read.csv("Raw Data/Somministrazione_3.csv")
####SHORTEN COLNAMES####
#Adding shorter names to columns. First 20 and last 50 characters (for some reason shorter strings make columns not unique)
manageable_names <- function(x){
paste0(
substr(colnames(x),1,20),
"......",
substr(colnames(x),nchar(colnames(x))-50,nchar(colnames(x)))
)
}
colnames(raw1) <- manageable_names(raw1)
colnames(raw2) <- manageable_names(raw2)
colnames(raw3) <- manageable_names(raw3)
####CLEANING COLUMNS RAW 1####
clean1 <- raw1 %>%
#Initial columns to eliminate
mutate(
ID.risposta......ID.risposta = NULL,
Ultima.pagina......Ultima.pagina = NULL,
Lingua.iniziale......Lingua.iniziale = NULL,
Seme......Seme = NULL,
Data.di.inizio......Data.di.inizio = NULL,
Data.dell.ultima.azi......Data.dell.ultima.azione = NULL,
Gentilissima.o...l.A.........Grazie.mille.per.il.Suo.prezioso.contributo..... = NULL,
) %>%
#Columns to rename with singe string
rename(
genere = Genere......Genere,
età = Età......Età,
date1 = Data.invio......Data.invio,
MalCronica = Soffre.di.una.malatt......Soffre.di.una.malattia.cronica..Se.sì..quale.,
Psicoterapia = Ha.seguito.un.percor......Ha.seguito.un.percorso.di.psicoterapia.,
codice = Codice.personale...S......nata.il.cinque.gennaio..il.codice.sarebbe.MR0501...
) %>%
#Save date as readable time
mutate(date1 = as.POSIXct(date1,format="%Y-%m-%d %H:%M:%S")) %>%
#FSCRS_tratto
rename_with(
~paste0("FSCRSt_s1_",seq_along(.)),
7:(7+21)
) %>%
#FSCRS_stato_PRE
rename_with(
~paste0("FSCRSs_s1_PRE_",seq_along(.)),
(7+21+1):(7+21+36)
) %>%
#Shame
rename_with(
~paste0("shame",seq_along(.)),
(7+21+36+1):(7+21+36+23)
) %>%
#OpenQ
rename(
OpenQ_1 = Puoi.descrivere.alme......tto.di.te.che.ti.crea.disagio.o.senso.di.vergogna..,
OpenQ_2 = Puoi.descrivere.un.a......etto.di.te.che.ti.crea.disagio.o.senso.di.vergogna.
) %>%
#FSCRS_stato
rename_with(
~paste0("FSCRSs_s1_POST_",seq_along(.)),
(7+21+36+23+2+1):(7+21+36+23+2+36)
) %>%
#Remove remaining cols
select(-((7+21+36+23+2+36+1):ncol(.)))
rm(raw1)
####CLEANING COLUMNS RAW 2####
clean2 <- raw2 %>%
#Prime colonne da togliere
mutate(
ID.risposta......ID.risposta = NULL,
Ultima.pagina......Ultima.pagina = NULL,
Lingua.iniziale......Lingua.iniziale = NULL,
Seme......Seme = NULL,
Data.di.inizio......Data.di.inizio = NULL,
Data.dell.ultima.azi......Data.dell.ultima.azione = NULL
) %>%
rename(date2 = Data.invio......Data.invio,
codice = Codice.personale...S......nata.il.cinque.gennaio..il.codice.sarebbe.MR0501...) %>%
mutate(date2 = as.POSIXct(date2,format="%Y-%m-%d %H:%M:%S")) %>%
rename_with(
~paste0("FSCRSt_s2_",seq_along(.)),
(2+1):(2+22)
) %>%
rename_with(
~paste0("FSCRSs_s2_",seq_along(.)),
(2+22+1):(2+22+36)
) %>%
#Remove remaining cols
select(-((2+22+36+1):ncol(.)))
rm(raw2)
####ClEANING COLUMNS RAW 3####
clean3 <- raw3 %>%
mutate(
ID.risposta......ID.risposta = NULL,
Ultima.pagina......Ultima.pagina = NULL,
Lingua.iniziale......Lingua.iniziale = NULL,
Seme......Seme = NULL,
Data.di.inizio......Data.di.inizio = NULL,
Data.dell.ultima.azi......Data.dell.ultima.azione = NULL
) %>%
rename(date3 = Data.invio......Data.invio,
codice = Codice.personale...S......nata.il.cinque.gennaio..il.codice.sarebbe.MR0501...) %>%
mutate(date3 = as.POSIXct(date3,format="%Y-%m-%d %H:%M:%S")) %>%
rename_with(
~paste0("FSCRSs_s3_",seq_along(.)),
(2+1):(2+36)
)%>%
#Remove remaining cols
select(-((2+36+1):ncol(.)))
rm(raw3)
####SUBSCALES FSCRSt####
clean1 <- clean1 %>%
rename_with(
~paste0("FSCRSt_s1_is_",c(1,2,4,6,7,14,17,18,20)),
6+c(1,2,4,6,7,14,17,18,20)
)%>%
rename_with(
~paste0("FSCRSt_s1_rs_",c(3,5,8,11,13,16,19,21)),
6+c(3,5,8,11,13,16,19,21)
)%>%
rename_with(
~paste0("FSCRSt_s1_hs_",c(9,10,12,15,22)),
6+c(9,10,12,15,22))
clean2 <- clean2 %>%
rename_with(
~paste0("FSCRSt_s2_is_",c(1,2,4,6,7,14,17,18,20)),
2+c(1,2,4,6,7,14,17,18,20)
)%>%
rename_with(
~paste0("FSCRSt_s2_rs_",c(3,5,8,11,13,16,19,21)),
2+c(3,5,8,11,13,16,19,21)
)%>%
rename_with(
~paste0("FSCRSt_s2_hs_",c(9,10,12,15,22)),
2+c(9,10,12,15,22))
####SUBSTITUTE WORDS FOR NUMBERS####
fscrs_numbering <- function(x){
x%>%
gsub("Totalmente vera per me",4,.) %>%
gsub("Alquanto vera per me",3,.) %>% #ci sono delle risposte con un doppio spazio
gsub("Alquanto vera per me",3,.) %>%
gsub("Abbastanza vera per me",2,.) %>%
gsub("Poco vera per me",1,.) %>%
gsub("Per niente vera per me",0,.) %>%
as.numeric
}
shame_numbering <- function(x){
x%>%
gsub("Molto",4,.) %>%
gsub("Abbastanza",3,.) %>%
gsub("Un poco",2,.) %>%
gsub("Per nulla",1,.) %>%
as.numeric
}
numbering <- function(x) {
x %>%
mutate(
across(contains("FSCRS"),fscrs_numbering)
) %>%
mutate(
across(contains("shame"),shame_numbering)
)
}
clean1 <- numbering(clean1)
clean2 <- numbering(clean2)
clean3 <- numbering(clean3)
####IDENTIFICATION CODES####
#Lowercase and remove punctuation
preProc_codes <- function(x){
x %>%
mutate(codice = tolower(codice) %>% removePunctuation() %>% trimws())
}
clean1 <- preProc_codes(clean1)
clean2 <- preProc_codes(clean2)
clean3 <- preProc_codes(clean3)
#Keeping only last instance of the same identification code for first and second administration
# keep_last_ans <- function(x){
# x %>%
# group_by(codice) %>%
# slice_max(order_by=date) %>%
# ungroup
# }
#
# raw1 <- keep_last_ans(raw1)
# raw2 <- keep_last_ans(raw2)
# raw3 <- keep_last_ans(raw3)
#Adding a NA count variable and keeping the rows without NAs (should eliminate partial answers)
keep_noNa_rows <- function(y){
y %T>%
{assign("NAcount",apply(.,1, \(x) sum(is.na(x))),1)}%>%
slice(which(NAcount==0))
}
clean1 <- clean1 %>% keep_noNa_rows
clean2 <- clean2 %>% keep_noNa_rows
clean3 <- clean3 %>% keep_noNa_rows
#Did someone decide to do the questionnaire more times than expected in its entirety?
clean1$codice %>%
table %>%
{which(.!=1)} %>%
{paste("Nel dataset 1",names(.),"ha risposto",unname(.),"volte")}
## [1] "Nel dataset 1 ha risposto volte"
clean2$codice %>%
table %>%
{.[which(.!=1)]} %>%
{paste("Nel dataset 2",names(.),"ha risposto",unname(.),"volte")}
## [1] "Nel dataset 2 ga1003 ha risposto 2 volte"
## [2] "Nel dataset 2 gc3008 ha risposto 2 volte"
## [3] "Nel dataset 2 ls2707 ha risposto 2 volte"
## [4] "Nel dataset 2 mr2509 ha risposto 2 volte"
clean3$codice %>%
table %>%
{.[which(.!=2)]} %>%
{paste("Nel dataset 3",names(.),"ha risposto",unname(.),"volte")}
## [1] "Nel dataset 3 aa0102 ha risposto 1 volte"
## [2] "Nel dataset 3 at2910 ha risposto 1 volte"
## [3] "Nel dataset 3 br0311xx ha risposto 1 volte"
## [4] "Nel dataset 3 bro311xx ha risposto 1 volte"
## [5] "Nel dataset 3 cg3008 ha risposto 1 volte"
## [6] "Nel dataset 3 cv2109 ha risposto 1 volte"
## [7] "Nel dataset 3 dg0205 ha risposto 1 volte"
## [8] "Nel dataset 3 dg0205xx ha risposto 1 volte"
## [9] "Nel dataset 3 ef1104 ha risposto 4 volte"
## [10] "Nel dataset 3 fb2403 ha risposto 1 volte"
## [11] "Nel dataset 3 fc0601 ha risposto 3 volte"
## [12] "Nel dataset 3 fc0601xx ha risposto 1 volte"
## [13] "Nel dataset 3 gc3008 ha risposto 3 volte"
## [14] "Nel dataset 3 gf0705 ha risposto 1 volte"
## [15] "Nel dataset 3 gg1842 ha risposto 1 volte"
## [16] "Nel dataset 3 gp0107 ha risposto 1 volte"
## [17] "Nel dataset 3 gp2203 ha risposto 3 volte"
## [18] "Nel dataset 3 lc0208 ha risposto 1 volte"
## [19] "Nel dataset 3 lr0314xx ha risposto 1 volte"
## [20] "Nel dataset 3 lr1403xx ha risposto 1 volte"
## [21] "Nel dataset 3 luqua1212 ha risposto 1 volte"
## [22] "Nel dataset 3 mc0206 ha risposto 1 volte"
## [23] "Nel dataset 3 mr2509 ha risposto 1 volte"
## [24] "Nel dataset 3 pst0307 ha risposto 1 volte"
## [25] "Nel dataset 3 sg0508 ha risposto 1 volte"
## [26] "Nel dataset 3 vt2907 ha risposto 1 volte"
#In third dataset separating rows based on first and last date for each code
clean3_sliced <- clean3 %>%
slice_max(order_by = date3,by = codice)
clean4 <- clean3 %>%
slice_min(order_by = date3,by = codice)
#Adding different colnames
colnames(clean4) <- colnames(clean4) %>%
gsub("date3","date4",.) %>%
gsub("_3_","_4_",.)
####QUALI CODICI MANCANO?####
compare_codes <- function(x){
(x$codice %in% clean1$codice) &
(x$codice %in% clean2$codice) &
(x$codice %in% clean3_sliced$codice) &
(x$codice %in% clean4$codice)
}
c(
clean1$codice[!compare_codes(clean1)],
clean2$codice[!compare_codes(clean2)],
clean3_sliced$codice[!compare_codes(clean3_sliced)],
clean4$codice[!compare_codes(clean4)]
) %>%
unique %>%
paste(collapse=",") %>%
paste("Codici per cui mancano somministrazioni:",.)
## [1] "Codici per cui mancano somministrazioni: cp2608,ag1003,cp1752,gp0310,ap1804,tm0103,lv1505,af2612,rp2809,fg0906,fp1985lr,tr0405,giu57,ga3007,mc0204,ms1410x,lr0314xx,pd2203,lo1102,am0501,lq1212,mu1506,ca0201,ma1403,ep0207,rg2702,am1101,lc2909,pr2809,mg2508,giu56,mr0204,lr1403xx,ms1410xx,df0663,cg3008,bro311xx,dp2203,dg0205,luqua1212"
####FIXING CODES####
#which codes are unmatched through the 3 administrations?
codes <- c(clean1$codice[!compare_codes(clean1)],
clean2$codice[!compare_codes(clean2)],
clean3_sliced$codice[!compare_codes(clean3_sliced)],
clean4$codice[!compare_codes(clean4)]
) %>%
unique
#Calculate the approximate string distance
match <- adist(codes)
colnames(match) <- codes
rownames(match) <- codes
#create side by side view of codes with a distance lower than 3
good_matches <- which(match<3 & match>0) %>%
arrayInd(dim(match)) %>%
{cbind(
rownames(match)[.[,1]],
colnames(match)[.[,2]])}
#update variable with remaining codes
codes <- codes[!codes%in%c(good_matches)]
#remake matrix
match <- adist(codes)
colnames(match) <- codes
rownames(match) <- codes
#matching codes with a distance of 3
matches <- which(match<4 & match>0) %>%
arrayInd(dim(match)) %>%
{cbind(
rownames(match)[.[,1]],
colnames(match)[.[,2]])}
matches %>%
t %>%
write.csv("matches.csv")
#The only match that could make sense with a distance greater than 3 is luqua1212 and lq1212
#Renaming codes
old_codes <- c("pr2809","giu56","mr0204","ms1410xx","dp2203","am1101","luqua1212")
new_codes <- c("rp2809","giu57","mc0204","ms1410x","pd2203","am0501","lq1212")
for (i in 1:length(old_codes)){
clean1 <- clean1 %>%
apply(2,\(x)gsub(old_codes[i],new_codes[i],x)) %>%
data.frame
clean2 <- clean2 %>%
apply(2,\(x)gsub(old_codes[i],new_codes[i],x)) %>%
data.frame
clean3_sliced <- clean3_sliced %>%
apply(2,\(x)gsub(old_codes[i],new_codes[i],x)) %>%
data.frame
clean4 <- clean4 %>%
apply(2,\(x)gsub(old_codes[i],new_codes[i],x)) %>%
data.frame
}
####UPDATING NAMES FOR 4th DATASET####
colnames(clean4) <- colnames(clean4) %>%
gsub("s3","s4",.)
####UNIONE DEI QUATTRO DATASET####
dati <- clean1 %>%
inner_join(clean2,by="codice",multiple = "last") %>%
inner_join(clean3_sliced,by="codice",multiple = "last") %>%
inner_join(clean4,by="codice",multiple = "last")
#Removing open questions for better visualisation
dati <- dati %>%
select(-contains("OpenQ"))
write.csv(dati,"clean_original_data.csv",row.names = F)