r Sys.Date()
# Load the drug-use-by-age survey data. quote = "" disables quote handling
# while the CSV is parsed.
drug.use.by.age.1 <- read.csv("~/R/LINKAGEDRS/PEC1/drug-use-by-age-1.csv", quote = "")
# Inspect the object that was just created (the previous call pointed at
# `drug.use.by.age`, which is never defined).
View(drug.use.by.age.1)
# Work on a copy with a shorter name so the raw import stays untouched.
drug_use <- drug.use.by.age.1
View(drug_use)
library(knitr)
# Render the first and last five rows as markdown tables.
kable(head(drug_use, 5))
kable(tail(drug_use, 5))
age | n | alcohol.use | alcohol.frequency | marijuana.use | marijuana.frequency | cocaine.use | cocaine.frequency | crack.use | crack.frequency | heroin.use | heroin.frequency | hallucinogen.use | hallucinogen.frequency | inhalant.use | inhalant.frequency | pain.releiver.use | pain.releiver.frequency | oxycontin.use | oxycontin.frequency | tranquilizer.use | tranquilizer.frequency | stimulant.use | stimulant.frequency | meth.use | meth.frequency | sedative.use | sedative.frequency |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | 2798 | 3.9 | 3 | 1.1 | 4 | 0.1 | 5.0 | 0.0 | NA | 0.1 | 35.5 | 0.2 | 52 | 1.6 | 19.0 | 2.0 | 36 | 0.1 | 24.5 | 0.2 | 52.0 | 0.2 | 2.0 | 0.0 | NA | 0.2 | 13.0 |
13 | 2757 | 8.5 | 6 | 3.4 | 15 | 0.1 | 1.0 | 0.0 | 3.0 | 0.0 | NA | 0.6 | 6 | 2.5 | 12.0 | 2.4 | 14 | 0.1 | 41.0 | 0.3 | 25.5 | 0.3 | 4.0 | 0.1 | 5.0 | 0.1 | 19.0 |
14 | 2792 | 18.1 | 5 | 8.7 | 24 | 0.1 | 5.5 | 0.0 | NA | 0.1 | 2.0 | 1.6 | 3 | 2.6 | 5.0 | 3.9 | 12 | 0.4 | 4.5 | 0.9 | 5.0 | 0.8 | 12.0 | 0.1 | 24.0 | 0.2 | 16.5 |
15 | 2956 | 29.2 | 6 | 14.5 | 25 | 0.5 | 4.0 | 0.1 | 9.5 | 0.2 | 1.0 | 2.1 | 4 | 2.5 | 5.5 | 5.5 | 10 | 0.8 | 3.0 | 2.0 | 4.5 | 1.5 | 6.0 | 0.3 | 10.5 | 0.4 | 30.0 |
16 | 3058 | 40.1 | 10 | 22.5 | 30 | 1.0 | 7.0 | 0.0 | 1.0 | 0.1 | 66.5 | 3.4 | 3 | 3.0 | 3.0 | 6.2 | 7 | 1.1 | 4.0 | 2.4 | 11.0 | 1.8 | 9.5 | 0.3 | 36.0 | 0.2 | 3.0 |
age | n | alcohol.use | alcohol.frequency | marijuana.use | marijuana.frequency | cocaine.use | cocaine.frequency | crack.use | crack.frequency | heroin.use | heroin.frequency | hallucinogen.use | hallucinogen.frequency | inhalant.use | inhalant.frequency | pain.releiver.use | pain.releiver.frequency | oxycontin.use | oxycontin.frequency | tranquilizer.use | tranquilizer.frequency | stimulant.use | stimulant.frequency | meth.use | meth.frequency | sedative.use | sedative.frequency | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
13 | 26-29 | 2628 | 80.7 | 52 | 20.8 | 52 | 3.2 | 5 | 0.4 | 6 | 0.6 | 50 | 3.2 | 3 | 0.6 | 4.0 | 8.3 | 13 | 1.2 | 13.5 | 4.2 | 10 | 2.3 | 7 | 0.6 | 30 | 0.4 | 4 |
14 | 30-34 | 2864 | 77.5 | 52 | 16.4 | 72 | 2.1 | 8 | 0.5 | 15 | 0.4 | 66 | 1.8 | 2 | 0.4 | 3.5 | 5.9 | 22 | 0.9 | 46.0 | 3.6 | 8 | 1.4 | 12 | 0.4 | 54 | 0.4 | 10 |
15 | 35-49 | 7391 | 75.0 | 52 | 10.4 | 48 | 1.5 | 15 | 0.5 | 48 | 0.1 | 280 | 0.6 | 3 | 0.3 | 10.0 | 4.2 | 12 | 0.3 | 12.0 | 1.9 | 6 | 0.6 | 24 | 0.2 | 104 | 0.3 | 10 |
16 | 50-64 | 3923 | 67.2 | 52 | 7.3 | 52 | 0.9 | 36 | 0.4 | 62 | 0.1 | 41 | 0.3 | 44 | 0.2 | 13.5 | 2.5 | 12 | 0.4 | 5.0 | 1.4 | 10 | 0.3 | 24 | 0.2 | 30 | 0.2 | 104 |
17 | 65+ | 2448 | 49.3 | 52 | 1.2 | 36 | 0.0 | NA | 0.0 | NA | 0.0 | 120 | 0.1 | 2 | 0.0 | NA | 0.6 | 24 | 0.0 | NA | 0.2 | 5 | 0.0 | 364 | 0.0 | NA | 0.0 | 15 |
# Number of rows and columns of the data frame.
c(nrow(drug_use), ncol(drug_use))
# Column names.
names(drug_use)
# Compact overview of each column's type and first values.
str(drug_use)
**Filas, columnas: 17, 28**
Nombres de las columnas [1] “age” “n” “alcohol.use”
“alcohol.frequency”
[5] “marijuana.use” “marijuana.frequency” “cocaine.use”
“cocaine.frequency”
[9] “crack.use” “crack.frequency” “heroin.use” “heroin.frequency”
[13] “hallucinogen.use” “hallucinogen.frequency” “inhalant.use”
“inhalant.frequency”
[17] “pain.releiver.use” “pain.releiver.frequency” “oxycontin.use”
“oxycontin.frequency”
[21] “tranquilizer.use” “tranquilizer.frequency” “stimulant.use”
“stimulant.frequency”
[25] “meth.use” “meth.frequency” “sedative.use” “sedative.frequency”
Tipos de datos del dataframe (chr: carácter, int: entero,
num: numérico):
‘data.frame’: 17 obs. of 28 variables:
$ age : chr “12” “13” “14” “15”
$ n : int 2798 2757 2792 2956 3058 3038 2469 2223 2271 2354
$ alcohol.use : num 3.9 8.5 18.1 29.2 40.1 49.3 58.7 64.6 69.7
83.2
$ alcohol.frequency : num 3 6 5 6 10 13 24 36 48 52
$ marijuana.use : num 1.1 3.4 8.7 14.5 22.5 28 33.7 33.4 34 33
$ marijuana.frequency : num 4 15 24 25 30 36 52 60 60 52
$ cocaine.use : num 0.1 0.1 0.1 0.5 1 2 3.2 4.1 4.9 4.8
$ cocaine.frequency : num 5 1 5.5 4 7 5 5 5.5 8 5
$ crack.use : num 0 0 0 0.1 0 0.1 0.4 0.5 0.6 0.5
$ crack.frequency : num NA 3 NA 9.5 1 21 10 2 5 17
$ heroin.use : num 0.1 0 0.1 0.2 0.1 0.1 0.4 0.5 0.9 0.6
$ heroin.frequency : num 35.5 NA 2 1 66.5 64 46 180 45 30
$ hallucinogen.use : num 0.2 0.6 1.6 2.1 3.4 4.8 7 8.6 7.4 6.3
$ hallucinogen.frequency : num 52 6 3 4 3 3 4 3 2 4 …
$ inhalant.use : num 1.6 2.5 2.6 2.5 3 2 1.8 1.4 1.5 1.4
$ inhalant.frequency : num 19 12 5 5.5 3 4 4 3 4 2
$ pain.releiver.use : num 2 2.4 3.9 5.5 6.2 8.5 9.2 9.4 10 9
$ pain.releiver.frequency: num 36 14 12 10 7 9 12 12 10 15
$ oxycontin.use : num 0.1 0.1 0.4 0.8 1.1 1.4 1.7 1.5 1.7 1.3
$ oxycontin.frequency : num 24.5 41 4.5 3 4 6 7 7.5 12 13.5
$ tranquilizer.use : num 0.2 0.3 0.9 2 2.4 3.5 4.9 4.2 5.4 3.9
$ tranquilizer.frequency : num 52 25.5 5 4.5 11 7 12 4.5 10 7
$ stimulant.use : num 0.2 0.3 0.8 1.5 1.8 2.8 3 3.3 4 4.1
$ stimulant.frequency : num 2 4 12 6 9.5 9 8 6 12 10
$ meth.use : num 0 0.1 0.1 0.3 0.3 0.6 0.5 0.4 0.9 0.6
$ meth.frequency : num NA 5 24 10.5 36 48 12 105 12 2
$ sedative.use : num 0.2 0.1 0.2 0.4 0.2 0.5 0.4 0.3 0.5 0.3
$ sedative.frequency : num 13 19 16.5 30 3 6.5 10 6 4 9
# Count the missing values (NA) in each column and render the counts as a table.
kable(colSums(is.na(drug_use)))
Datos nulos por columna
Parámetro | x |
---|---|
age | 0 |
n | 0 |
alcohol.use | 0 |
alcohol.frequency | 0 |
marijuana.use | 0 |
marijuana.frequency | 0 |
cocaine.use | 0 |
cocaine.frequency | 1 |
crack.use | 0 |
crack.frequency | 3 |
heroin.use | 0 |
heroin.frequency | 1 |
hallucinogen.use | 0 |
hallucinogen.frequency | 0 |
inhalant.use | 0 |
inhalant.frequency | 1 |
pain.releiver.use | 0 |
pain.releiver.frequency | 0 |
oxycontin.use | 0 |
oxycontin.frequency | 1 |
tranquilizer.use | 0 |
tranquilizer.frequency | 0 |
stimulant.use | 0 |
stimulant.frequency | 0 |
meth.use | 0 |
meth.frequency | 2 |
sedative.use | 0 |
sedative.frequency | 0 |
# Per-column summary (quartiles, mean, NA counts for numeric columns) rendered as a table.
kable(summary(drug_use))
age | n | alcohol.use | alcohol.frequency | marijuana.use | marijuana.frequency | cocaine.use | cocaine.frequency | crack.use | crack.frequency | heroin.use | heroin.frequency | hallucinogen.use | hallucinogen.frequency | inhalant.use | inhalant.frequency | pain.releiver.use | pain.releiver.frequency | oxycontin.use | oxycontin.frequency | tranquilizer.use | tranquilizer.frequency | stimulant.use | stimulant.frequency | meth.use | meth.frequency | sedative.use | sedative.frequency | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Length:17 | Min. :2223 | Min. : 3.90 | Min. : 3.00 | Min. : 1.10 | Min. : 4.00 | Min. :0.000 | Min. : 1.000 | Min. :0.0000 | Min. : 1.00 | Min. :0.0000 | Min. : 1.00 | Min. :0.100 | Min. : 2.000 | Min. :0.000 | Min. : 2.000 | Min. : 0.600 | Min. : 7.00 | Min. :0.0000 | Min. : 3.00 | Min. :0.200 | Min. : 4.50 | Min. :0.000 | Min. : 2.00 | Min. :0.0000 | Min. : 2.00 | Min. :0.0000 | Min. : 3.00 | |
Class :character | 1st Qu.:2469 | 1st Qu.:40.10 | 1st Qu.:10.00 | 1st Qu.: 8.70 | 1st Qu.:30.00 | 1st Qu.:0.500 | 1st Qu.: 5.000 | 1st Qu.:0.0000 | 1st Qu.: 5.00 | 1st Qu.:0.1000 | 1st Qu.: 39.62 | 1st Qu.:0.600 | 1st Qu.: 3.000 | 1st Qu.:0.600 | 1st Qu.: 3.375 | 1st Qu.: 3.900 | 1st Qu.:12.00 | 1st Qu.:0.4000 | 1st Qu.: 5.75 | 1st Qu.:1.400 | 1st Qu.: 6.00 | 1st Qu.:0.600 | 1st Qu.: 7.00 | 1st Qu.:0.2000 | 1st Qu.: 12.00 | 1st Qu.:0.2000 | 1st Qu.: 6.50 | |
Mode :character | Median :2798 | Median :64.60 | Median :48.00 | Median :20.80 | Median :52.00 | Median :2.000 | Median : 5.250 | Median :0.4000 | Median : 7.75 | Median :0.2000 | Median : 53.75 | Median :3.200 | Median : 3.000 | Median :1.400 | Median : 4.000 | Median : 6.200 | Median :12.00 | Median :1.1000 | Median :12.00 | Median :3.500 | Median :10.00 | Median :1.800 | Median : 10.00 | Median :0.4000 | Median : 30.00 | Median :0.3000 | Median : 10.00 | |
NA | Mean :3251 | Mean :55.43 | Mean :33.35 | Mean :18.92 | Mean :42.94 | Mean :2.176 | Mean : 7.875 | Mean :0.2941 | Mean :15.04 | Mean :0.3529 | Mean : 73.28 | Mean :3.394 | Mean : 8.412 | Mean :1.388 | Mean : 6.156 | Mean : 6.271 | Mean :14.71 | Mean :0.9353 | Mean :14.81 | Mean :2.806 | Mean :11.74 | Mean :1.918 | Mean : 31.15 | Mean :0.3824 | Mean : 35.97 | Mean :0.2824 | Mean : 19.38 | |
NA | 3rd Qu.:3058 | 3rd Qu.:77.50 | 3rd Qu.:52.00 | 3rd Qu.:28.40 | 3rd Qu.:52.00 | 3rd Qu.:4.000 | 3rd Qu.: 7.250 | 3rd Qu.:0.5000 | 3rd Qu.:16.50 | 3rd Qu.:0.6000 | 3rd Qu.: 71.88 | 3rd Qu.:5.200 | 3rd Qu.: 4.000 | 3rd Qu.:2.000 | 3rd Qu.: 6.625 | 3rd Qu.: 9.000 | 3rd Qu.:15.00 | 3rd Qu.:1.4000 | 3rd Qu.:18.12 | 3rd Qu.:4.200 | 3rd Qu.:11.00 | 3rd Qu.:3.000 | 3rd Qu.: 12.00 | 3rd Qu.:0.6000 | 3rd Qu.: 47.00 | 3rd Qu.:0.4000 | 3rd Qu.: 17.50 | |
NA | Max. :7391 | Max. :84.20 | Max. :52.00 | Max. :34.00 | Max. :72.00 | Max. :4.900 | Max. :36.000 | Max. :0.6000 | Max. :62.00 | Max. :1.1000 | Max. :280.00 | Max. :8.600 | Max. :52.000 | Max. :3.000 | Max. :19.000 | Max. :10.000 | Max. :36.00 | Max. :1.7000 | Max. :46.00 | Max. :5.400 | Max. :52.00 | Max. :4.100 | Max. :364.00 | Max. :0.9000 | Max. :105.00 | Max. :0.5000 | Max. :104.00 | |
NA | NA | NA | NA | NA | NA | NA | NA’s :1 | NA | NA’s :3 | NA | NA’s :1 | NA | NA | NA | NA’s :1 | NA | NA | NA | NA’s :1 | NA | NA | NA | NA | NA | NA’s :2 | NA | NA |
Para las variables numéricas podemos utilizar los siguientes
tipos de gráficos:
Histogramas: Para visualizar la distribución de una variable.
Boxplots: Para representar la mediana, cuartiles y detectar valores
atípicos.
Correlación: Para evaluar relaciones entre variables numéricas.
En el caso de variables categóricas podemos usar:
Gráficos de barras y de pastel: Para visualizar la frecuencia de
categorías. Tablas de contingencia: Para ver relaciones entre variables
categóricas, en nuestro caso no aplica, ya que sólo hay un parámetro
categórico.
library(MASS)
data(UScereal)
View(UScereal)
# Work on a copy so that mistakes during the transformation do not touch
# the original dataset.
cereal <- UScereal
View(cereal)
# Keep only two decimals in the numeric columns (the original one-liner was
# obtained with ChatGPT's help). Assigning back into the selected columns,
# instead of rebuilding the data frame with data.frame(lapply(...)), keeps
# the row names, which hold the cereal names.
numeric_cols <- vapply(cereal, is.numeric, logical(1))
cereal[numeric_cols] <- lapply(cereal[numeric_cols], round, digits = 2)
# str() prints the structure itself and returns NULL, so it must not be
# wrapped in kable().
str(cereal)
‘data.frame’: 65 obs. of 11 variables:
$ mfr : Factor w/ 6 levels “G”,“K”,“N”,“P”,..: 3 2 2 1 2 1 6 4 5 1
…
$ calories : num 212 212 100 147 110 …
$ protein : num 12.12 12.12 8 2.67 2 …
$ fat : num 3.03 3.03 0 2.67 0 2.67 1.49 0 2.67 1.6 …
$ sodium : num 394 788 280 240 125 …
$ fibre : num 30.3 27.3 28 2 1 …
$ carbo : num 15.2 21.2 16 14 11 …
$ sugars : num 18.2 15.2 0 13.3 14 …
$ shelf : num 3 3 3 1 2 3 1 3 2 1 …
$ potassium: num 848.5 969.7 660 93.3 30 …
$ vitamins : Factor w/ 3 levels “100%”,“enriched”,..: 2 2 2 2 2 2 2 2 2
2 …
En el parámetro “vitamins” puede cambiarse una de las categorías, de “100%” a “full”, para mayor homogeneidad de los datos
# Rename the factor level "100%" to "full". The previous code assigned "full"
# to NA entries, which does not rename anything: "full" is not an existing
# level, so such an assignment only produces NA with a warning.
levels(cereal$vitamins)[levels(cereal$vitamins) == "100%"] <- "full"
El parámetro “shelf” puede cambiarse a categórico
# Treat the shelf number as a categorical variable.
cereal[["shelf"]] <- factor(cereal[["shelf"]])
# Confirm the new column type.
str(cereal)
Aquí ya aparece cambiado: shelf : Factor w/ 3 levels “1”,“2”,“3”: 3 3 3 1 2 3 1 3 2 1 …
# Look up the current working directory.
getwd()
# This is the working directory: "C:/Users/UJA/Documents/R/LINKAGEDRS/PEC1"
# Save the data frame as CSV and as a plain table. The target must be a file
# inside that directory, not the directory itself, and each format gets its
# own file so the second write does not overwrite the first.
out_dir <- "C:/Users/UJA/Documents/R/LINKAGEDRS/PEC1"
write.csv(cereal, file.path(out_dir, "cereal.csv"), quote = FALSE, row.names = FALSE)
write.table(cereal, file.path(out_dir, "cereal.txt"), quote = FALSE, row.names = FALSE)
He seleccionado el paquete “cBioPortalData” de R, que da acceso a un conjunto de datos de estudio del Portal de Genómica del Cáncer de cBio. La referencia bibliográfica es:

> Ramos M, Geistlinger L, Oh S, Schiffer L, Azhar R, Kodali H, de Bruijn I, Gao J, Carey V, Morgan M, Waldron L (2020). “Integración multiómica de bases de datos oncológicas públicas en Bioconductor”. JCO Clinical Cancer Informatics, 1 (4), 958-971. [doi:10.1200/CCI.19.00119], PMID: 33119407, [https://ascopubs.org/doi/pdf/10.1200/CCI.19.00119].

En concreto, este dataset tiene datos clínicos y genéticos de cáncer de mama, incluyendo la carga mutacional del tumor, lo que me ha parecido interesante.

### 3.2. Mostrad y describid la información más relevante de la estructura del dataset escogido.
# Structure of the dataset: per-column summary rendered as a table.
kable(summary(brca_tcga_clinical_data))
studyId | patientId | sampleId | CANCER_TYPE | CANCER_TYPE_DETAILED | DAYS_TO_COLLECTION | FRACTION_GENOME_ALTERED | IS_FFPE | MUTATION_COUNT | OCT_EMBEDDED | ONCOTREE_CODE | OTHER_SAMPLE_ID | PATHOLOGY_REPORT_FILE_NAME | PATHOLOGY_REPORT_UUID | SAMPLE_INITIAL_WEIGHT | SAMPLE_TYPE | SAMPLE_TYPE_ID | SOMATIC_STATUS | TMB_NONSYNONYMOUS | VIAL_NUMBER | AGE | AJCC_METASTASIS_PATHOLOGIC_PM | AJCC_NODES_PATHOLOGIC_PN | AJCC_PATHOLOGIC_TUMOR_STAGE | AJCC_STAGING_EDITION | AJCC_TUMOR_PATHOLOGIC_PT | DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS | ER_STATUS_BY_IHC | ER_STATUS_IHC_PERCENT_POSITIVE | ETHNICITY | FORM_COMPLETION_DATE | HER2_CENT17_RATIO | HER2_FISH_STATUS | HER2_IHC_SCORE | HISTOLOGICAL_DIAGNOSIS | HISTORY_NEOADJUVANT_TRTYN | HISTORY_OTHER_MALIGNANCY | ICD_10 | ICD_O_3_HISTOLOGY | ICD_O_3_SITE | IHC_HER2 | INFORMED_CONSENT_VERIFIED | INITIAL_PATHOLOGIC_DX_YEAR | LYMPH_NODE_EXAMINED_COUNT | LYMPH_NODES_EXAMINED | MENOPAUSE_STATUS | METHOD_OF_INITIAL_SAMPLE_PROCUREMENT | NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT | OS_MONTHS | OS_STATUS | OTHER_PATIENT_ID | PATH_MARGIN | PHARMACEUTICAL_TX_ADJUVANT | PR_STATUS_BY_IHC | PR_STATUS_IHC_PERCENT_POSITIVE | PRIMARY_SITE_PATIENT | PROSPECTIVE_COLLECTION | RACE | RADIATION_TREATMENT_ADJUVANT | RETROSPECTIVE_COLLECTION | SAMPLE_COUNT | SEX | SITE_OF_TUMOR_TISSUE | STAGING_SYSTEM | SURGICAL_PROCEDURE_FIRST | TISSUE_SOURCE_SITE | TUMOR_STATUS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Min. : 21.0 | Min. :0.0000 | Length:500 | Min. : 1.00 | Mode :logical | Length:500 | Length:500 | Length:500 | Length:500 | Min. : 5.0 | Length:500 | Min. :1.00 | Length:500 | Min. : 0.03333 | Length:500 | Min. :26.00 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Min. :0 | Length:500 | Length:500 | Length:500 | Length:500 | Min. : 0.800 | Length:500 | Min. :1.000 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Min. :1993 | Min. : 1.00 | Length:500 | Length:500 | Length:500 | Length:500 | Min. : 0.03 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Min. :1.000 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | Length:500 | |
Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 101.8 | 1st Qu.:0.1116 | Class :character | 1st Qu.: 18.00 | FALSE:202 | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 130.0 | Class :character | 1st Qu.:1.00 | Class :character | 1st Qu.: 0.60000 | Class :character | 1st Qu.:49.00 | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:0 | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 1.090 | Class :character | 1st Qu.:1.000 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:2007 | 1st Qu.: 3.00 | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 16.32 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:1.000 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | |
Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median : 259.5 | Median :0.2274 | Mode :character | Median : 28.00 | TRUE :296 | Mode :character | Mode :character | Mode :character | Mode :character | Median : 210.0 | Mode :character | Median :1.00 | Mode :character | Median : 0.96667 | Mode :character | Median :58.00 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :0 | Mode :character | Mode :character | Mode :character | Mode :character | Median : 1.200 | Mode :character | Median :1.000 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :2010 | Median :10.00 | Mode :character | Mode :character | Mode :character | Mode :character | Median : 26.07 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :1.000 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | |
NA | NA | NA | NA | NA | Mean : 916.7 | Mean :0.2765 | NA | Mean : 56.65 | NA’s :2 | NA | NA | NA | NA | Mean : 295.9 | NA | Mean :1.02 | NA | Mean : 2.00042 | NA | Mean :58.81 | NA | NA | NA | NA | NA | Mean :0 | NA | NA | NA | NA | Mean : 1.782 | NA | Mean :1.631 | NA | NA | NA | NA | NA | NA | NA | NA | Mean :2009 | Mean :10.62 | NA | NA | NA | NA | Mean : 38.63 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Mean :1.008 | NA | NA | NA | NA | NA | NA | |
NA | NA | NA | NA | NA | 3rd Qu.:1373.5 | 3rd Qu.:0.4145 | NA | 3rd Qu.: 52.00 | NA | NA | NA | NA | NA | 3rd Qu.: 377.5 | NA | 3rd Qu.:1.00 | NA | 3rd Qu.: 1.84167 | NA | 3rd Qu.:68.00 | NA | NA | NA | NA | NA | 3rd Qu.:0 | NA | NA | NA | NA | 3rd Qu.: 1.700 | NA | 3rd Qu.:2.000 | NA | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.:2011 | 3rd Qu.:15.00 | NA | NA | NA | NA | 3rd Qu.: 50.84 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.:1.000 | NA | NA | NA | NA | NA | NA | |
NA | NA | NA | NA | NA | Max. :6583.0 | Max. :0.9971 | NA | Max. :3426.00 | NA | NA | NA | NA | NA | Max. :1650.0 | NA | Max. :6.00 | NA | Max. :117.13333 | NA | Max. :90.00 | NA | NA | NA | NA | NA | Max. :0 | NA | NA | NA | NA | Max. :16.300 | NA | Max. :3.000 | NA | NA | NA | NA | NA | NA | NA | NA | Max. :2013 | Max. :44.00 | NA | NA | NA | NA | Max. :216.59 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Max. :2.000 | NA | NA | NA | NA | NA | NA | |
NA | NA | NA | NA | NA | NA’s :4 | NA’s :4 | NA | NA’s :20 | NA | NA | NA | NA | NA | NA’s :2 | NA | NA | NA | NA’s :20 | NA | NA’s :2 | NA | NA | NA | NA | NA | NA’s :2 | NA | NA | NA | NA | NA’s :393 | NA | NA’s :267 | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :4 | NA’s :23 | NA | NA | NA | NA | NA’s :2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
# Shorten the dataset name and drop columns that carry no relevant data
# (identifiers, file names and sample bookkeeping fields).
drop_cols <- c(
  "PATHOLOGY_REPORT_FILE_NAME",
  "PATHOLOGY_REPORT_UUID",
  "OTHER_SAMPLE_ID",
  "studyId",
  "VIAL_NUMBER",
  "METHOD_OF_INITIAL_SAMPLE_PROCUREMENT",
  "OTHER_PATIENT_ID",
  "sampleId"
)
brca <- brca_tcga_clinical_data
brca <- brca[setdiff(names(brca), drop_cols)]
# Remove the skin tumours so that only breast tumours remain.
brca <- brca %>% filter(CANCER_TYPE != "Skin Cancer, Non-Melanoma")
# Summary table of the data. R is case-sensitive: the function is kable(),
# not Kable().
kable(summary(brca))
patientId | CANCER_TYPE | CANCER_TYPE_DETAILED | DAYS_TO_COLLECTION | FRACTION_GENOME_ALTERED | IS_FFPE | MUTATION_COUNT | OCT_EMBEDDED | ONCOTREE_CODE | SAMPLE_INITIAL_WEIGHT | SAMPLE_TYPE | SAMPLE_TYPE_ID | SOMATIC_STATUS | TMB_NONSYNONYMOUS | AGE | AJCC_METASTASIS_PATHOLOGIC_PM | AJCC_NODES_PATHOLOGIC_PN | AJCC_PATHOLOGIC_TUMOR_STAGE | AJCC_STAGING_EDITION | AJCC_TUMOR_PATHOLOGIC_PT | DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS | ER_STATUS_BY_IHC | ER_STATUS_IHC_PERCENT_POSITIVE | ETHNICITY | FORM_COMPLETION_DATE | HER2_CENT17_RATIO | HER2_FISH_STATUS | HER2_IHC_SCORE | HISTOLOGICAL_DIAGNOSIS | HISTORY_NEOADJUVANT_TRTYN | HISTORY_OTHER_MALIGNANCY | ICD_10 | ICD_O_3_HISTOLOGY | ICD_O_3_SITE | IHC_HER2 | INFORMED_CONSENT_VERIFIED | INITIAL_PATHOLOGIC_DX_YEAR | LYMPH_NODE_EXAMINED_COUNT | LYMPH_NODES_EXAMINED | MENOPAUSE_STATUS | NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT | OS_MONTHS | OS_STATUS | PATH_MARGIN | PHARMACEUTICAL_TX_ADJUVANT | PR_STATUS_BY_IHC | PR_STATUS_IHC_PERCENT_POSITIVE | PRIMARY_SITE_PATIENT | PROSPECTIVE_COLLECTION | RACE | RADIATION_TREATMENT_ADJUVANT | RETROSPECTIVE_COLLECTION | SAMPLE_COUNT | SEX | SITE_OF_TUMOR_TISSUE | STAGING_SYSTEM | SURGICAL_PROCEDURE_FIRST | TISSUE_SOURCE_SITE | TUMOR_STATUS | ratio1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Length:499 | Length:499 | Length:499 | Min. : 21.0 | Min. :0.0000 | Length:499 | Min. : 1.00 | Mode :logical | Length:499 | Min. : 5.0 | Length:499 | Min. :1.00 | Length:499 | Min. : 0.03333 | Min. :26.00 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Min. :0 | Length:499 | Length:499 | Length:499 | Length:499 | Min. : 0.800 | Length:499 | Min. :1.000 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Min. :1993 | Min. : 1.00 | Length:499 | Length:499 | Length:499 | Min. : 0.03 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Min. :1.000 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Length:499 | Min. : 0.01471 | |
Class :character | Class :character | Class :character | 1st Qu.: 101.5 | 1st Qu.:0.1115 | Class :character | 1st Qu.: 18.00 | FALSE:202 | Class :character | 1st Qu.: 130.0 | Class :character | 1st Qu.:1.00 | Class :character | 1st Qu.: 0.60000 | 1st Qu.:49.00 | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:0 | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 1.090 | Class :character | 1st Qu.:1.000 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:2007 | 1st Qu.: 3.00 | Class :character | Class :character | Class :character | 1st Qu.: 16.29 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.:1.000 | Class :character | Class :character | Class :character | Class :character | Class :character | Class :character | 1st Qu.: 0.31017 | |
Mode :character | Mode :character | Mode :character | Median : 259.0 | Median :0.2262 | Mode :character | Median : 28.00 | TRUE :295 | Mode :character | Median : 210.0 | Mode :character | Median :1.00 | Mode :character | Median : 0.96667 | Median :58.00 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :0 | Mode :character | Mode :character | Mode :character | Mode :character | Median : 1.200 | Mode :character | Median :1.500 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :2010 | Median :10.00 | Mode :character | Mode :character | Mode :character | Median : 26.12 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median :1.000 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character | Median : 0.48387 | |
NA | NA | NA | Mean : 917.6 | Mean :0.2761 | NA | Mean : 56.65 | NA’s :2 | NA | Mean : 296.3 | NA | Mean :1.02 | NA | Mean : 2.00042 | Mean :58.81 | NA | NA | NA | NA | NA | Mean :0 | NA | NA | NA | NA | Mean : 1.782 | NA | Mean :1.634 | NA | NA | NA | NA | NA | NA | NA | NA | Mean :2009 | Mean :10.64 | NA | NA | NA | Mean : 38.66 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Mean :1.008 | NA | NA | NA | NA | NA | NA | Mean : 0.93050 | |
NA | NA | NA | 3rd Qu.:1375.0 | 3rd Qu.:0.4143 | NA | 3rd Qu.: 52.00 | NA | NA | 3rd Qu.: 380.0 | NA | 3rd Qu.:1.00 | NA | 3rd Qu.: 1.84167 | 3rd Qu.:68.00 | NA | NA | NA | NA | NA | 3rd Qu.:0 | NA | NA | NA | NA | 3rd Qu.: 1.700 | NA | 3rd Qu.:2.000 | NA | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.:2011 | 3rd Qu.:15.00 | NA | NA | NA | 3rd Qu.: 50.85 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 3rd Qu.:1.000 | NA | NA | NA | NA | NA | NA | 3rd Qu.: 0.90693 | |
NA | NA | NA | Max. :6583.0 | Max. :0.9971 | NA | Max. :3426.00 | NA | NA | Max. :1650.0 | NA | Max. :6.00 | NA | Max. :117.13333 | Max. :90.00 | NA | NA | NA | NA | NA | Max. :0 | NA | NA | NA | NA | Max. :16.300 | NA | Max. :3.000 | NA | NA | NA | NA | NA | NA | NA | NA | Max. :2013 | Max. :44.00 | NA | NA | NA | Max. :216.59 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Max. :2.000 | NA | NA | NA | NA | NA | NA | Max. :38.06667 | |
NA | NA | NA | NA’s :4 | NA’s :4 | NA | NA’s :19 | NA | NA | NA’s :2 | NA | NA | NA | NA’s :19 | NA’s :2 | NA | NA | NA | NA | NA | NA’s :2 | NA | NA | NA | NA | NA’s :392 | NA | NA’s :267 | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :4 | NA’s :23 | NA | NA | NA | NA’s :2 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA’s :20 |
# Create a new variable, ratio1 = number of mutations / age.
brca$ratio1 <- brca$MUTATION_COUNT / brca$AGE
# Check the new variable with its five-number summary.
kable(fivenum(brca$ratio1))
# Recode RACE into character codes. All codes are written as strings for
# consistency (the column is character, so a bare 2 would be coerced anyway),
# and %in% never yields NA in the index, unlike ==.
brca$RACE[brca$RACE %in% "WHITE"] <- "1"
brca$RACE[brca$RACE %in% "BLACK OR AFRICAN AMERICAN"] <- "2"
brca$RACE[brca$RACE %in% "ASIAN"] <- "3"
# List the tumour types included in the study. R is case-sensitive: the
# function is kable(), not Kable().
kable(unique(brca$CANCER_TYPE_DETAILED))
Resultado
x |
---|
Breast Invasive Lobular Carcinoma |
Breast Invasive Ductal Carcinoma |
Paget Disease of the Nipple |
Metaplastic Breast Cancer |
Breast Invasive Mixed Mucinous Carcinoma |
Breast Mixed Ductal and Lobular Carcinoma |
Breast Invasive Carcinoma, NOS |
Invasive Breast Carcinoma |
Solid Papillary Carcinoma of the Breast |
Adenoid Cystic Breast Cancer |
Malignant Phyllodes Tumor of the Breast |
Breast |
# Split the patients into two data sets: those whose tumour type is
# "Breast Invasive Lobular Carcinoma" and all the others. which() drops rows
# where the comparison is NA, exactly as subset() does.
is_lobular <- brca$CANCER_TYPE_DETAILED == "Breast Invasive Lobular Carcinoma"
lobular <- brca[which(is_lobular), ]
View(lobular)
non_lobular <- brca[which(!is_lobular), ]
View(non_lobular)
# Stack both subsets back into a single data set.
fusion <- rbind(lobular, non_lobular)
View(fusion)
# Export to CSV
# NOTE(review): no export call follows this comment here.
healthy_life_years_1 <- read_excel("healthy-life-years-1.xlsx")
# The pasted console prompts ("> ") were removed: they are not valid R in a
# script.
years <- healthy_life_years_1
View(years)
# Rename the year columns: purely numeric names are awkward to use in R.
# Every four-digit column is handled, not just 2011.
year_cols <- grepl("^[0-9]{4}$", colnames(years))
colnames(years)[year_cols] <- paste0("Year_", colnames(years)[year_cols])
# Convert the year columns, whose numbers are stored as text, to numeric.
years[year_cols] <- lapply(years[year_cols], as.numeric)
# Check that the conversion worked.
str(years)
tibble [37 × 13] (S3: tbl_df/tbl/data.frame)
$ TIME : chr [1:37] “Belgium” “Bulgaria” “Czechia” “Denmark” …
$ Year_2011: num [1:37] 63.5 64 62.9 61.8 58.2 56.1 67.2 66.6 65.5 63.1
…
$ Year_2012: num [1:37] 64.6 63.9 63.2 61.4 57.6 55.1 67.2 64.9 65.3
63.2 …
$ Year_2013: num [1:37] 63.9 64.5 63.3 60.5 57.4 55.5 66.9 64.9 64.3
63.6 …
$ Year_2014: num [1:37] 64.1 64 64.1 60.9 56.5 55.2 66.9 64.5 65 63.8
…
$ Year_2015: num [1:37] 64.2 63.2 63 59.1 66.4 55 67.2 64 64 63.6
…
$ Year_2016: num [1:37] 63.7 65.7 63.3 60.3 66.4 56.8 68.5 64.3 66.2
63.4 …
$ Year_2017: num [1:37] 63.7 64.5 61.4 59.7 66 56 68.6 64.8 69.4 63.7
…
$ Year_2018: num [1:37] 63.4 65.8 62.7 60.9 65.8 53.9 69.4 65.4 68 63.9
…
$ Year_2019: num [1:37] 62.4 66.3 62 58.9 66.3 55.8 69.6 66 69.9 64.1
…
$ Year_2020: num [1:37] 63.8 65.6 61.6 58 65.7 57.6 66.2 65.9 66.3 64.6
…
$ Year_2021: num [1:37] 64.6 63.3 62 56.6 65.6 56.5 67.2 65.6 62.8 66.2
…
$ Year_2022: num [1:37] 63.7 66.7 61.8 55.9 61.1 59.3 66 67 61.2 64.4
…
# Count the missing values per column. R is case-sensitive: the function is
# kable(), not Kable().
kable(colSums(is.na(years)))
Resultados
x | |
---|---|
TIME | 0 |
Year_2011 | 6 |
Year_2012 | 7 |
Year_2013 | 7 |
Year_2014 | 6 |
Year_2015 | 6 |
Year_2016 | 6 |
Year_2017 | 6 |
Year_2018 | 6 |
Year_2019 | 8 |
Year_2020 | 8 |
Year_2021 | 9 |
Year_2022 | 8 |
# Add a column with the number of missing values in each row.
years$NA_Count <- rowSums(is.na(years))
# Show the first and the last rows. The previous calls used the non-existent
# names Kable() and tails(); the correct functions are kable() and tail().
kable(head(years))
kable(tail(years))
Resultados
TIME | Year_2011 | Year_2012 | Year_2013 | Year_2014 | Year_2015 | Year_2016 | Year_2017 | Year_2018 | Year_2019 | Year_2020 | Year_2021 | Year_2022 | NA_Count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Belgium | 63.5 | 64.6 | 63.9 | 64.1 | 64.2 | 63.7 | 63.7 | 63.4 | 62.4 | 63.8 | 64.6 | 63.7 | 0 |
Bulgaria | 64.0 | 63.9 | 64.5 | 64.0 | 63.2 | 65.7 | 64.5 | 65.8 | 66.3 | 65.6 | 63.3 | 66.7 | 0 |
Czechia | 62.9 | 63.2 | 63.3 | 64.1 | 63.0 | 63.3 | 61.4 | 62.7 | 62.0 | 61.6 | 62.0 | 61.8 | 0 |
Denmark | 61.8 | 61.4 | 60.5 | 60.9 | 59.1 | 60.3 | 59.7 | 60.9 | 58.9 | 58.0 | 56.6 | 55.9 | 0 |
Germany | 58.2 | 57.6 | 57.4 | 56.5 | 66.4 | 66.4 | 66.0 | 65.8 | 66.3 | 65.7 | 65.6 | 61.1 | 0 |
Estonia | 56.1 | 55.1 | 55.5 | 55.2 | 55.0 | 56.8 | 56.0 | 53.9 | 55.8 | 57.6 | 56.5 | 59.3 | 0 |
TIME | Year_2011 | Year_2012 | Year_2013 | Year_2014 | Year_2015 | Year_2016 | Year_2017 | Year_2018 | Year_2019 | Year_2020 | Year_2021 | Year_2022 | NA_Count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
United Kingdom | 65.2 | 64.5 | 64.6 | 63.8 | 63.5 | 63.1 | 62.7 | 61.2 | NA | NA | NA | NA | 4 |
Montenegro | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 12 |
North Macedonia | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 12 |
Albania | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 12 |
Serbia | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 12 |
Türkiye | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 12 |
# Arithmetic mean of the 2011 column, ignoring missing values.
mean(years$Year_2011, na.rm = TRUE)
Resultado [1] 62.16452
# Row-wise maximum over the yearly columns only. Helper columns such as
# NA_Count must be left out: otherwise a country with no data at all would get
# its NA count (12) as its "maximum". pmax() with na.rm = TRUE returns NA for
# rows in which every value is missing.
year_cols <- setdiff(names(years), c("TIME", "NA_Count", "Maximum"))
years$Maximum <- do.call(pmax, c(unname(as.list(years[year_cols])), na.rm = TRUE))
# The two statements were fused on one line, which does not parse.
View(years)
### 4.4. Definid un data frame, years2, que muestre los datos para los
países que cumplan 2 condiciones: (i) tienen menos de 2 datos faltantes
y (ii) su valor de max_years es estrictamente superior a la media
aritmética de max_years (calculada considerando todos los países con
al menos un valor no faltante).
# Only the yearly measurement columns take part in the calculations; the
# country name and the helper columns are excluded explicitly.
year_cols <- setdiff(names(years), c("TIME", "NA_Count", "Maximum", "max_years"))
# Step 1: count the missing values per row.
years <- years %>% mutate(NA_Count = rowSums(is.na(.[year_cols])))
# Step 2: maximum of each row ignoring NA. pmax() yields NA for rows with no
# data at all. Previously NA_Count was part of the input, so such rows got a
# maximum of 12 and dragged the mean down.
years <- years %>%
  mutate(max_years = do.call(pmax, c(unname(as.list(.[year_cols])), na.rm = TRUE)))
# Step 3: mean of `max_years` over the countries that have at least one
# non-missing value (all-NA countries have max_years = NA and are skipped).
media_max_years <- mean(years$max_years, na.rm = TRUE)
# Step 4: keep the countries with fewer than 2 missing values and
# `max_years` strictly above the mean, then drop the NA_Count helper column,
# which is no longer needed.
years2 <- years %>%
  filter(NA_Count < 2, max_years > media_max_years) %>%
  select(-NA_Count)
kable(head(years2))
Resultados
TIME | Year_2011 | Year_2012 | Year_2013 | Year_2014 | Year_2015 | Year_2016 | Year_2017 | Year_2018 | Year_2019 | Year_2020 | Year_2021 | Year_2022 | Maximum | max_years |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Belgium | 63.5 | 64.6 | 63.9 | 64.1 | 64.2 | 63.7 | 63.7 | 63.4 | 62.4 | 63.8 | 64.6 | 63.7 | 64.6 | 64.6 |
Bulgaria | 64.0 | 63.9 | 64.5 | 64.0 | 63.2 | 65.7 | 64.5 | 65.8 | 66.3 | 65.6 | 63.3 | 66.7 | 66.7 | 66.7 |
Czechia | 62.9 | 63.2 | 63.3 | 64.1 | 63.0 | 63.3 | 61.4 | 62.7 | 62.0 | 61.6 | 62.0 | 61.8 | 64.1 | 64.1 |
Denmark | 61.8 | 61.4 | 60.5 | 60.9 | 59.1 | 60.3 | 59.7 | 60.9 | 58.9 | 58.0 | 56.6 | 55.9 | 61.8 | 61.8 |
Germany | 58.2 | 57.6 | 57.4 | 56.5 | 66.4 | 66.4 | 66.0 | 65.8 | 66.3 | 65.7 | 65.6 | 61.1 | 66.4 | 66.4 |
Estonia | 56.1 | 55.1 | 55.5 | 55.2 | 55.0 | 56.8 | 56.0 | 53.9 | 55.8 | 57.6 | 56.5 | 59.3 | 59.3 | 59.3 |
# Matrix holding only the 2011 and 2022 columns of `years`.
columnas_matriz <- c("Year_2011", "Year_2022")
years_matriz <- as.matrix(years[, columnas_matriz])
# Inspect the matrix in the interactive viewer.
View(years_matriz)
Respuesta a la pregunta: Es conveniente usar una matriz cuando todos los datos son del mismo tipo (numéricos o de caracteres), cuando se requieren cálculos rápidos o cuando se trabaja con grandes volúmenes de datos. El data frame es mejor para conjuntos de datos que combinan tipos (caracteres y numéricos) y que se necesite filtrar.
# Inspect the drug-use table in the interactive viewer.
View(drug_use)
# Per-column summary rendered as a table. R is case-sensitive: the knitr
# function is `kable`, not `Kable`.
kable(summary(drug_use))
age | n | alcohol.use | alcohol.frequency | marijuana.use | marijuana.frequency | cocaine.use | cocaine.frequency | crack.use | crack.frequency | heroin.use | heroin.frequency | hallucinogen.use | hallucinogen.frequency | inhalant.use | inhalant.frequency | pain.releiver.use | pain.releiver.frequency | oxycontin.use | oxycontin.frequency | tranquilizer.use | tranquilizer.frequency | stimulant.use | stimulant.frequency | meth.use | meth.frequency | sedative.use | sedative.frequency | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Length:17 | Min. :2223 | Min. : 3.90 | Min. : 3.00 | Min. : 1.10 | Min. : 4.00 | Min. :0.000 | Min. : 1.000 | Min. :0.0000 | Min. : 1.00 | Min. :0.0000 | Min. : 1.00 | Min. :0.100 | Min. : 2.000 | Min. :0.000 | Min. : 2.000 | Min. : 0.600 | Min. : 7.00 | Min. :0.0000 | Min. : 3.00 | Min. :0.200 | Min. : 4.50 | Min. :0.000 | Min. : 2.00 | Min. :0.0000 | Min. : 2.00 | Min. :0.0000 | Min. : 3.00 | |
Class :character | 1st Qu.:2469 | 1st Qu.:40.10 | 1st Qu.:10.00 | 1st Qu.: 8.70 | 1st Qu.:30.00 | 1st Qu.:0.500 | 1st Qu.: 5.000 | 1st Qu.:0.0000 | 1st Qu.: 5.00 | 1st Qu.:0.1000 | 1st Qu.: 39.62 | 1st Qu.:0.600 | 1st Qu.: 3.000 | 1st Qu.:0.600 | 1st Qu.: 3.375 | 1st Qu.: 3.900 | 1st Qu.:12.00 | 1st Qu.:0.4000 | 1st Qu.: 5.75 | 1st Qu.:1.400 | 1st Qu.: 6.00 | 1st Qu.:0.600 | 1st Qu.: 7.00 | 1st Qu.:0.2000 | 1st Qu.: 12.00 | 1st Qu.:0.2000 | 1st Qu.: 6.50 | |
Mode :character | Median :2798 | Median :64.60 | Median :48.00 | Median :20.80 | Median :52.00 | Median :2.000 | Median : 5.250 | Median :0.4000 | Median : 7.75 | Median :0.2000 | Median : 53.75 | Median :3.200 | Median : 3.000 | Median :1.400 | Median : 4.000 | Median : 6.200 | Median :12.00 | Median :1.1000 | Median :12.00 | Median :3.500 | Median :10.00 | Median :1.800 | Median : 10.00 | Median :0.4000 | Median : 30.00 | Median :0.3000 | Median : 10.00 | |
NA | Mean :3251 | Mean :55.43 | Mean :33.35 | Mean :18.92 | Mean :42.94 | Mean :2.176 | Mean : 7.875 | Mean :0.2941 | Mean :15.04 | Mean :0.3529 | Mean : 73.28 | Mean :3.394 | Mean : 8.412 | Mean :1.388 | Mean : 6.156 | Mean : 6.271 | Mean :14.71 | Mean :0.9353 | Mean :14.81 | Mean :2.806 | Mean :11.74 | Mean :1.918 | Mean : 31.15 | Mean :0.3824 | Mean : 35.97 | Mean :0.2824 | Mean : 19.38 | |
NA | 3rd Qu.:3058 | 3rd Qu.:77.50 | 3rd Qu.:52.00 | 3rd Qu.:28.40 | 3rd Qu.:52.00 | 3rd Qu.:4.000 | 3rd Qu.: 7.250 | 3rd Qu.:0.5000 | 3rd Qu.:16.50 | 3rd Qu.:0.6000 | 3rd Qu.: 71.88 | 3rd Qu.:5.200 | 3rd Qu.: 4.000 | 3rd Qu.:2.000 | 3rd Qu.: 6.625 | 3rd Qu.: 9.000 | 3rd Qu.:15.00 | 3rd Qu.:1.4000 | 3rd Qu.:18.12 | 3rd Qu.:4.200 | 3rd Qu.:11.00 | 3rd Qu.:3.000 | 3rd Qu.: 12.00 | 3rd Qu.:0.6000 | 3rd Qu.: 47.00 | 3rd Qu.:0.4000 | 3rd Qu.: 17.50 | |
NA | Max. :7391 | Max. :84.20 | Max. :52.00 | Max. :34.00 | Max. :72.00 | Max. :4.900 | Max. :36.000 | Max. :0.6000 | Max. :62.00 | Max. :1.1000 | Max. :280.00 | Max. :8.600 | Max. :52.000 | Max. :3.000 | Max. :19.000 | Max. :10.000 | Max. :36.00 | Max. :1.7000 | Max. :46.00 | Max. :5.400 | Max. :52.00 | Max. :4.100 | Max. :364.00 | Max. :0.9000 | Max. :105.00 | Max. :0.5000 | Max. :104.00 | |
NA | NA | NA | NA | NA | NA | NA | NA’s :1 | NA | NA’s :3 | NA | NA’s :1 | NA | NA | NA | NA’s :1 | NA | NA | NA | NA’s :1 | NA | NA | NA | NA | NA | NA’s :2 | NA | NA |
Llama la atención el inicio temprano en el consumo de algunas drogas. También destaca que la droga más consumida es el alcohol, que además presenta una frecuencia de uso muy alta. Contrasta con la heroína, que tiene una tasa de consumo baja pero una frecuencia de uso alta.

### 5.2. Realizad un diagrama de cajas sobre las variables alcohol.use, marijuana.use, y cocaine.use. ¿Qué conclusiones se pueden extraer?
# One box plot per substance, each with its own fill colour and title.
graficos <- list(
  list(columna = "alcohol.use", color = "tomato", titulo = "alcohol use"),
  list(columna = "marijuana.use", color = "blue", titulo = "marijuana use"),
  list(columna = "cocaine.use", color = "grey", titulo = "cocaine use")
)
for (g in graficos) {
  boxplot(drug_use[[g$columna]], col = g$color, main = g$titulo)
}
Sin embargo, estos gráficos no me convencen, ya que no reflejan la edad en el eje X. Por ello, he recurrido a esta otra estrategia:
# Consumption of each substance broken down by age group (age on the x axis).
# NOTE(review): the table appears to hold one row per age group, so each box
# would collapse to a single line — confirm this is the intended display.
boxplot(
  alcohol.use ~ age,
  data = drug_use,
  col = "lightblue",
  main = "Consumo de Alcohol por Edad",
  xlab = "age",
  ylab = "alcohol.use"
)
boxplot(
  marijuana.use ~ age,
  data = drug_use,
  col = "red",
  main = "Consumo de Cannabis por Edad",
  xlab = "age",
  ylab = "marijuana.use"
)
# The y label now matches the plotted variable name, like the two plots above.
boxplot(
  cocaine.use ~ age,
  data = drug_use,
  col = "green",
  main = "Consumo de Cocaina por Edad",
  xlab = "age",
  ylab = "cocaine.use"
)
Resultados
El cannabis es más
consumido por gente joven, mientras que el alcohol es más usado por personas
mayores. El uso de la cocaína se inicia más tarde que el del cannabis.
Tanto el alcohol como el cannabis comienzan a usarse a edades
tempranas.
# Build a new data frame that holds only the three drugs of interest.
# A data frame cannot be called like a function, so the columns are
# extracted with `[`.
drug.selection <- drug_use[, c("crack.use", "heroin.use", "hallucinogen.use")]
# Draw the scatterplot matrix of the three variables
pairs(drug.selection)
# Visual inspection of the pairwise plots
# Correlation between heroin use and crack use
kable(cor(drug.selection$crack.use, drug.selection$heroin.use))
Correlación 0.7507903
# Correlation between crack use and hallucinogen use
kable(with(drug.selection, cor(crack.use, hallucinogen.use)))
Correlación: 0.5562225
# Correlation between heroin use and hallucinogen use
kable(with(drug.selection, cor(heroin.use, hallucinogen.use)))
Correlación: 0.683673
Resultados: Observamos cierta correlación entre el uso de heroína y el de crack; algo menos fuerte es la correlación entre el uso de heroína y el de alucinógenos.
# Load the UScereal data set and open it in the interactive viewer.
data("UScereal")
View(UScereal)

# Absolute frequency tables for manufacturer (mfr) and shelf.
tabla_mfr <- table(UScereal$mfr)
tabla_shelf <- table(UScereal$shelf)
kable(tabla_mfr)
kable(tabla_shelf)

# Relative frequency tables derived from the same counts.
kable(prop.table(tabla_mfr))
kable(prop.table(tabla_shelf))
Tabla de frecuencias absolutas de mfr
Var1 | Freq |
---|---|
G | 22 |
K | 21 |
N | 3 |
P | 9 |
Q | 5 |
R | 5 |
Tabla de frecuencias absolutas de shelf
Var1 | Freq |
---|---|
1 | 18 |
2 | 18 |
3 | 29 |
Tabla de frecuencias relativas de mfr
Var1 | Freq |
---|---|
G | 0.3384615 |
K | 0.3230769 |
N | 0.0461538 |
P | 0.1384615 |
Q | 0.0769231 |
R | 0.0769231 |
Tabla de frecuencias relativas de shelf
Var1 | Freq |
---|---|
1 | 0.2769231 |
2 | 0.2769231 |
3 | 0.4461538 |
# Keep the country label plus the yearly columns only, so helper columns
# (NA counts, maxima) never reach the reshaped data.
year_cols <- grep("^Year_", names(years), value = TRUE)
years_pure <- years[, c("TIME", year_cols)]

# Drop every country that has a missing value in any year.
years_complete <- na.omit(years_pure)

# Reshape to long format: one row per country and year. unlist() stacks the
# yearly columns one after another, so the country labels repeat once per
# column and each year repeats once per country.
years_long <- data.frame(
  COUNTRY = rep(years_complete$TIME, times = length(year_cols)),
  Año = rep(
    as.numeric(gsub("Year_", "", year_cols)),
    each = nrow(years_complete)
  ),
  Edad_Libre_Enfermedad = unlist(years_complete[year_cols], use.names = FALSE)
)
df_long <- years_long

# Keep the first 10 countries (head() copes with fewer than 10 being present).
df_top10 <- df_long %>%
  filter(COUNTRY %in% head(unique(COUNTRY), 10))

# Line chart with one coloured line per country.
ggplot(
  df_top10,
  aes(x = Año, y = Edad_Libre_Enfermedad, color = COUNTRY, group = COUNTRY)
) +
  geom_line(size = 1) +
  geom_point() +
  labs(
    title = "Evolución de la Edad Libre de Enfermedad (2011 - 2022) (Top 10 Países)",
    x = "Año",
    y = "Edad Libre de Enfermedad (años)"
  ) +
  theme_minimal() +
  theme(legend.position = "right")
Resultado
# Build the data frame "df2022" with only the 2022 value of each country.
# The column names match the aesthetics used by the plot below, and the
# object name matches the one the plot reads.
df2022 <- data.frame(
  País = years$TIME,
  Año_2022 = years$Year_2022
)
# Countries without a 2022 value cannot be drawn or ordered, so drop them.
df2022 <- df2022[!is.na(df2022$Año_2022), ]

# A bar chart was judged the best way to represent these data; bars are
# ordered by their 2022 value.
ggplot(df2022, aes(x = reorder(País, Año_2022), y = Año_2022, fill = País)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  labs(
    title = "Edad Libre de Enfermedad en 2022 por País",
    x = "País",
    y = "Años Libres de Enfermedad"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Resultados
Nota: He tenido que recurrir a ChatGPT para hacer esta última parte; aun así, me ha costado mucho trabajo poder terminarlo.