Data yang digunakan: Data HCC Survival
Memanggil package yang dibutuhkan:
library(readxl)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(imputeMissings)
##
## Attaching package: 'imputeMissings'
## The following object is masked from 'package:dplyr':
##
## compute
library(ggplot2)
library(ggcorrplot)
# Load the categorical attributes of the HCC survival data from the
# "Data Kategorik" sheet of the workbook, then print a compact overview of
# the column types and first values.
# NOTE(review): the workbook path is absolute and machine-specific.
HCCKategorik <- read_excel(
  path = "D:\\TINGKAT IV\\5 - DATA MINING\\TUGAS\\Data HCC Survival 4SE1.xlsx",
  sheet = "Data Kategorik"
)
glimpse(HCCKategorik)
## Rows: 165
## Columns: 28
## $ Patients <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ~
## $ Gender <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ Symptoms <chr> "0", "?", "0", "1", "1", "0", "0", "1",~
## $ Alcohol <dbl> 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ~
## $ `Hepatitis B Surface Antigen` <chr> "0", "0", "1", "0", "1", "0", "0", "0",~
## $ `Hepatitis B e Antigen` <chr> "0", "0", "0", "0", "0", "?", "?", "?",~
## $ `Hepatitis B Core Antibody` <chr> "0", "0", "1", "0", "1", "0", "1", "0",~
## $ `Hepatitis C Virus Antibody` <chr> "0", "1", "0", "0", "0", "0", "1", "0",~
## $ Cirrhosis <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ `Endemic Countries` <chr> "0", "?", "0", "0", "0", "0", "0", "0",~
## $ Smoking <chr> "1", "?", "1", "1", "1", "?", "0", "1",~
## $ Diabetes <chr> "1", "1", "0", "1", "0", "0", "1", "1",~
## $ Obesity <chr> "?", "0", "0", "0", "0", "1", "0", "?",~
## $ Hemochromatosis <chr> "1", "0", "0", "0", "0", "0", "?", "0",~
## $ `Arterial Hypertension` <chr> "0", "1", "1", "1", "1", "0", "0", "0",~
## $ `Chronic Renal Insufficiency` <chr> "0", "0", "1", "0", "1", "0", "0", "0",~
## $ `Human Immunodeficiency Virus` <chr> "0", "0", "0", "0", "0", "0", "0", "0",~
## $ `Nonalcoholic Steatohepatitis` <chr> "0", "0", "0", "0", "0", "0", "0", "0",~
## $ `Esophageal Varices` <chr> "1", "1", "0", "0", "0", "1", "0", "0",~
## $ Splenomegaly <chr> "0", "0", "0", "0", "0", "1", "0", "1",~
## $ `Portal Hypertension` <chr> "0", "0", "1", "0", "0", "1", "0", "1",~
## $ `Portal Vein Thrombosis` <chr> "0", "0", "0", "0", "0", "0", "0", "1",~
## $ `Liver Metastasis: nominal` <chr> "0", "0", "1", "1", "0", "0", "0", "0",~
## $ `Radiological Hallmark` <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ `Performance Status` <dbl> 0, 0, 2, 0, 0, 1, 0, 3, 1, 0, 0, 0, 0, ~
## $ `Encefalopathy degree` <chr> "1", "1", "1", "1", "1", "1", "1", "1",~
## $ `Ascites degree` <chr> "1", "1", "2", "1", "1", "2", "1", "1",~
## $ Class <dbl> 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, ~
# Recode every numeric and every character column as a factor in one pass so
# that all attributes are handled as categorical variables. `across()` with
# `where()` replaces the two superseded `mutate_if()` calls; columns of any
# other type are left untouched, exactly as before.
# NOTE(review): this also turns the `Patients` identifier into a factor --
# confirm that is intended.
HCCKategorik <- HCCKategorik %>%
  mutate(across(where(is.numeric) | where(is.character), as.factor))

# Inspect the result: the "?" placeholder is still an ordinary factor level
# at this stage.
glimpse(HCCKategorik)
## Rows: 165
## Columns: 28
## $ Patients <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ~
## $ Gender <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ Symptoms <fct> 0, ?, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, ~
## $ Alcohol <fct> 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ~
## $ `Hepatitis B Surface Antigen` <fct> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Hepatitis B e Antigen` <fct> 0, 0, 0, 0, 0, ?, ?, ?, 0, 0, 0, 0, 0, ~
## $ `Hepatitis B Core Antibody` <fct> 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ~
## $ `Hepatitis C Virus Antibody` <fct> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, ~
## $ Cirrhosis <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ `Endemic Countries` <fct> 0, ?, 0, 0, 0, 0, 0, 0, 0, 0, ?, 1, 0, ~
## $ Smoking <fct> 1, ?, 1, 1, 1, ?, 0, 1, 1, 0, ?, 0, 1, ~
## $ Diabetes <fct> 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, ~
## $ Obesity <fct> ?, 0, 0, 0, 0, 1, 0, ?, 0, 0, 0, 0, 0, ~
## $ Hemochromatosis <fct> 1, 0, 0, 0, 0, 0, ?, 0, 0, 1, 0, 0, 0, ~
## $ `Arterial Hypertension` <fct> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, ~
## $ `Chronic Renal Insufficiency` <fct> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Human Immunodeficiency Virus` <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Nonalcoholic Steatohepatitis` <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ?, ~
## $ `Esophageal Varices` <fct> 1, 1, 0, 0, 0, 1, 0, 0, ?, 0, ?, ?, ?, ~
## $ Splenomegaly <fct> 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, ~
## $ `Portal Hypertension` <fct> 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, ~
## $ `Portal Vein Thrombosis` <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ~
## $ `Liver Metastasis: nominal` <fct> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Radiological Hallmark` <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ~
## $ `Performance Status` <fct> 0, 0, 2, 0, 0, 1, 0, 3, 1, 0, 0, 0, 0, ~
## $ `Encefalopathy degree` <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ `Ascites degree` <fct> 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, ~
## $ Class <fct> 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, ~
# Turn the "?" placeholder into a real NA in every factor column.
#
# The replacement is done column by column instead of calling `na_if()` on
# the whole data frame: dplyr 1.1.0 and later no longer accept a data frame
# as the first argument of `na_if()`. `%in%` is used for the comparison so
# that existing NA values never produce an NA index. `droplevels()` then
# removes the now-unused "?" level so it cannot leak into later summaries,
# imputation or plot legends.
HCCKategorik <- HCCKategorik %>%
  mutate(
    across(
      where(is.factor),
      ~ droplevels(replace(.x, .x %in% "?", NA))
    )
  )

# Inspect the result: former "?" entries are now shown as NA.
glimpse(HCCKategorik)
## Rows: 165
## Columns: 28
## $ Patients <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ~
## $ Gender <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ Symptoms <fct> 0, NA, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,~
## $ Alcohol <fct> 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, ~
## $ `Hepatitis B Surface Antigen` <fct> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Hepatitis B e Antigen` <fct> 0, 0, 0, 0, 0, NA, NA, NA, 0, 0, 0, 0, ~
## $ `Hepatitis B Core Antibody` <fct> 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ~
## $ `Hepatitis C Virus Antibody` <fct> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, ~
## $ Cirrhosis <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ `Endemic Countries` <fct> 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, NA, 1, 0~
## $ Smoking <fct> 1, NA, 1, 1, 1, NA, 0, 1, 1, 0, NA, 0, ~
## $ Diabetes <fct> 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, ~
## $ Obesity <fct> NA, 0, 0, 0, 0, 1, 0, NA, 0, 0, 0, 0, 0~
## $ Hemochromatosis <fct> 1, 0, 0, 0, 0, 0, NA, 0, 0, 1, 0, 0, 0,~
## $ `Arterial Hypertension` <fct> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, ~
## $ `Chronic Renal Insufficiency` <fct> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Human Immunodeficiency Virus` <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Nonalcoholic Steatohepatitis` <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA,~
## $ `Esophageal Varices` <fct> 1, 1, 0, 0, 0, 1, 0, 0, NA, 0, NA, NA, ~
## $ Splenomegaly <fct> 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, ~
## $ `Portal Hypertension` <fct> 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, ~
## $ `Portal Vein Thrombosis` <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, ~
## $ `Liver Metastasis: nominal` <fct> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ `Radiological Hallmark` <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, ~
## $ `Performance Status` <fct> 0, 0, 2, 0, 0, 1, 0, 3, 1, 0, 0, 0, 0, ~
## $ `Encefalopathy degree` <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ `Ascites degree` <fct> 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, ~
## $ Class <fct> 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, ~
library(dplyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Small one-row example frame and a version of it whose text values are
# quoted as SQL string literals.
# NOTE(review): neither `import` nor `import_sql` is used by the visible
# analysis code -- this looks like a leftover snippet; confirm it is needed.
import <- data.frame(
  Test_Name = "Fir'st Last",
  Test_Date = "2019-01-01",
  Test_Number = 10
)

# Steps, in order:
#   1. drop columns that consist only of NA,
#   2. convert factor columns to character,
#   3. for character columns: trim surrounding whitespace, double every
#      single quote, then wrap the value in single quotes,
#   4. wrap Date columns (detected with lubridate's `is.Date()`) in single
#      quotes.
# The steps stay in separate `mutate()` calls so each `where()` predicate is
# evaluated on the output of the previous step.
# NOTE(review): this is hand-rolled SQL quoting. If the values are ever sent
# to a database, prefer parameterised queries or the driver's own quoting
# function over string building.
import_sql <- import %>%
  select(where(~ !all(is.na(.x)))) %>%
  mutate(across(where(is.factor), as.character)) %>%
  mutate(
    across(
      where(is.character),
      ~ paste0("'", gsub("'", "''", trimws(.x)), "'")
    )
  ) %>%
  mutate(across(where(is.Date), ~ paste0("'", .x, "'")))
# Plot-size hint for the output device.
# NOTE(review): `repr.plot.*` options are presumably only read by repr-based
# front ends (e.g. Jupyter/IRkernel) -- confirm they matter when knitting.
options(repr.plot.width = 6, repr.plot.height = 4)

# Share of missing values per attribute, in long format (one row per
# attribute). `mean(is.na(x))` equals `sum(is.na(x)) / n()`. `across()` and
# `pivot_longer()` replace the deprecated `funs()` and the superseded
# `gather()`, which removes the `funs()` deprecation warning.
# Despite its name, `percent_missing` holds a proportion between 0 and 1.
missing_data <- HCCKategorik %>%
  summarise(across(everything(), ~ mean(is.na(.x)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variables",
    values_to = "percent_missing"
  )

# Horizontal bar chart, attributes ordered by their share of missing values.
# `geom_col()` is the direct equivalent of `geom_bar(stat = "identity")`;
# the white outline is a constant, so it is set outside `aes()`.
ggplot(
  missing_data,
  aes(x = reorder(variables, percent_missing), y = percent_missing)
) +
  geom_col(fill = "red", color = "white", size = 0.3) +
  xlab("variables") +
  coord_flip() +
  theme_bw()
Dapat dilihat bahwa batang berwarna merah pada beberapa atribut menunjukkan persentase missing value-nya. Missing value paling banyak terdapat pada atribut Esophageal Varices, dan terdapat 5 variabel yang sudah lengkap (tidak memiliki missing value).
# Impute the missing values with the "median/mode" method of the
# imputeMissings package: first compute the replacement values, then apply
# them to the data.
# Both calls are namespace-qualified because `imputeMissings::compute()`
# masks `dplyr::compute()`; without the prefix, which function runs depends
# on the order in which the packages were attached.
imput <- imputeMissings::compute(HCCKategorik, method = "median/mode")
HCCKategorik_nonmissing <- imputeMissings::impute(HCCKategorik, object = imput)

# Inspect the imputed data. Column names that contained spaces come back in
# dotted form (e.g. `Hepatitis.B.Surface.Antigen`); the plots further down
# rely on these dotted names.
glimpse(HCCKategorik_nonmissing)
## Rows: 165
## Columns: 28
## $ Patients <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13~
## $ Gender <fct> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ Symptoms <fct> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,~
## $ Alcohol <fct> 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,~
## $ Hepatitis.B.Surface.Antigen <fct> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Hepatitis.B.e.Antigen <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Hepatitis.B.Core.Antibody <fct> 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,~
## $ Hepatitis.C.Virus.Antibody <fct> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,~
## $ Cirrhosis <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ Endemic.Countries <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,~
## $ Smoking <fct> 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,~
## $ Diabetes <fct> 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,~
## $ Obesity <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Hemochromatosis <fct> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,~
## $ Arterial.Hypertension <fct> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,~
## $ Chronic.Renal.Insufficiency <fct> 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Human.Immunodeficiency.Virus <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Nonalcoholic.Steatohepatitis <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Esophageal.Varices <fct> 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,~
## $ Splenomegaly <fct> 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,~
## $ Portal.Hypertension <fct> 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,~
## $ Portal.Vein.Thrombosis <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,~
## $ Liver.Metastasis..nominal <fct> 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Radiological.Hallmark <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,~
## $ Performance.Status <fct> 0, 0, 2, 0, 0, 1, 0, 3, 1, 0, 0, 0, 0, 0,~
## $ Encefalopathy.degree <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ Ascites.degree <fct> 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1,~
## $ Class <fct> 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,~
Terlihat secara sekilas bahwa pada beberapa observasi pertama yang ditampilkan tidak ada missing value. Selanjutnya dilakukan pemeriksaan kembali untuk keseluruhan observasi.
# Plot-size hint for the output device.
# NOTE(review): `repr.plot.*` options are presumably only read by repr-based
# front ends (e.g. Jupyter/IRkernel) -- confirm they matter when knitting.
options(repr.plot.width = 6, repr.plot.height = 4)

# Re-check the imputed data: share of missing values per attribute, in long
# format. `mean(is.na(x))` equals `sum(is.na(x)) / n()`. `across()` and
# `pivot_longer()` replace the deprecated `funs()` and the superseded
# `gather()`.
# Despite its name, `percent_missing` holds a proportion between 0 and 1.
missing_data <- HCCKategorik_nonmissing %>%
  summarise(across(everything(), ~ mean(is.na(.x)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variables",
    values_to = "percent_missing"
  )

# Same horizontal bar chart as for the raw data. `geom_col()` is the direct
# equivalent of `geom_bar(stat = "identity")`; the white outline is a
# constant, so it is set outside `aes()`.
ggplot(
  missing_data,
  aes(x = reorder(variables, percent_missing), y = percent_missing)
) +
  geom_col(fill = "red", color = "white", size = 0.3) +
  xlab("variables") +
  coord_flip() +
  theme_bw()
Berdasarkan hasil output, dapat disimpulkan bahwa sudah tidak terdapat missing value pada seluruh atribut.
# One stacked, proportion-scaled bar chart per predictor, showing the share
# of each `Class` value within every category of that predictor.
#
# The predictors are all columns except the patient identifier and the
# target itself, taken in column order (Gender ... Ascites.degree), which is
# the same set and order the individual plot calls used to list by hand.
predictor_names <- setdiff(
  names(HCCKategorik_nonmissing),
  c("Patients", "Class")
)

for (predictor in predictor_names) {
  # `.data[[predictor]]` looks the column up by its name stored in a string.
  # Plots are not auto-printed inside a loop, hence the explicit `print()`.
  print(
    ggplot(
      HCCKategorik_nonmissing,
      aes(x = .data[[predictor]], fill = Class)
    ) +
      geom_bar(position = "fill") +
      xlab(predictor)
  )
}
Kemudian dapat dilakukan pemilihan atribut berdasarkan pengamatan signifikansi secara visual.