3.baskı, Seçkin Yayınevi, Ankara.
CRISP-DM (Cross Industry Standard Process for Data Mining)
install.packages("lattice")
install.packages("mice")
install.packages("VIM")
install.packages("readxl")
install.packages("partykit")
install.packages("fastDummies")
install.packages("caret")
# Read the Excel sheet and convert the result to a plain data.frame
# (character columns are kept as character, not turned into factors).
veri <- data.frame(
  read_excel("data/adl_missing.xlsx"),
  stringsAsFactors = FALSE
)
head(veri)
## Kalp Yas Diyabet Tansiyon Sigara Egzersiz ## 1 1 67 0 0 0 0 ## 2 1 75 1 1 0 0 ## 3 1 76 0 0 0 0 ## 4 1 67 1 0 1 0 ## 5 0 65 0 0 0 1 ## 6 1 74 0 0 0 0
sum(is.na(veri))
## [1] 9
mean(is.na(veri))
## [1] 0.075
Çıktıdan görüldüğü gibi 9 değer kayıptır ve kayıp değerlerin oranı %7.5 olarak verilmektedir.
summary(veri)
## Kalp Yas Diyabet Tansiyon ## Min. :0.0 Min. :42.00 Min. :0.0000 Min. :0.0000 ## 1st Qu.:0.0 1st Qu.:65.00 1st Qu.:0.0000 1st Qu.:0.0000 ## Median :1.0 Median :68.00 Median :0.0000 Median :0.0000 ## Mean :0.7 Mean :64.88 Mean :0.2105 Mean :0.2105 ## 3rd Qu.:1.0 3rd Qu.:71.00 3rd Qu.:0.0000 3rd Qu.:0.0000 ## Max. :1.0 Max. :76.00 Max. :1.0000 Max. :1.0000 ## NA's :3 NA's :1 NA's :1 ## Sigara Egzersiz ## Min. :0.0000 Min. :0.0000 ## 1st Qu.:0.0000 1st Qu.:0.0000 ## Median :0.0000 Median :0.0000 ## Mean :0.1176 Mean :0.3158 ## 3rd Qu.:0.0000 3rd Qu.:1.0000 ## Max. :1.0000 Max. :1.0000 ## NA's :3 NA's :1
Örneğin bir verinin başka bir programdan aktarıldığını ve kayıp değerlerin 99 şeklinde kodlandığını varsayalım. Yas sütunundaki 99 değerlerini NA sembolüyle değiştirmek için aşağıda verilen komut uygulanabilir.
# Recode the sentinel value 99 in the Yas column as a missing value.
veri$Yas <- replace(veri$Yas, which(veri$Yas == 99), NA)
# Keep every airquality column except columns 5 and 6.
havaKalitesi <- airquality[-c(5, 6)]
# Show the first rows of the data.
head(havaKalitesi)
## Ozone Solar.R Wind Temp ## 1 41 190 7.4 67 ## 2 36 118 8.0 72 ## 3 12 149 12.6 74 ## 4 18 313 11.5 62 ## 5 NA NA 14.3 56 ## 6 28 NA 14.9 66
# Descriptive statistics
summary(havaKalitesi)
## Ozone Solar.R Wind Temp ## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :56.00 ## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:72.00 ## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00 ## Mean : 42.13 Mean :185.9 Mean : 9.958 Mean :77.88 ## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00 ## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00 ## NA's :37 NA's :7
# Introduce missing values on purpose: rows 5-12 of column 3 (Wind)
# and rows 3-8 of column 4 (Temp).
havaKalitesi[5:12, 3] <- NA
havaKalitesi[3:8, 4] <- NA
summary(havaKalitesi)
## Ozone Solar.R Wind Temp ## Min. : 1.00 Min. : 7.0 Min. : 1.700 Min. :57.00 ## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.400 1st Qu.:73.00 ## Median : 31.50 Median :205.0 Median : 9.700 Median :79.00 ## Mean : 42.13 Mean :185.9 Mean : 9.839 Mean :78.46 ## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.500 3rd Qu.:85.00 ## Max. :168.00 Max. :334.0 Max. :20.700 Max. :97.00 ## NA's :37 NA's :7 NA's :8 NA's :6
# Percentage (0-100) of NA entries in x.
kayipOran <- function(x) {
  eksik_sayisi <- sum(is.na(x))
  eksik_sayisi / length(x) * 100
}
apply(havaKalitesi,2,kayipOran)
## Ozone Solar.R Wind Temp ## 24.183007 4.575163 5.228758 3.921569
Ozon miktarına ilişkin ölçümlerdeki kayıp değer oranı %24.18, güneş radyasyonu değerine ilişkin ölçümlerdeki kayıp değer oranı %4.58, rüzgara ilişkin ölçümlerdeki kayıp değer oranı %5.23 ve sıcaklığa ilişkin ölçümlerdeki kayıp değer oranı %3.92 olarak gözükmektedir.
apply(havaKalitesi,1,kayipOran)
## [1] 0 0 25 25 100 75 50 50 25 50 50 25 0 0 0 0 0 0 ## [19] 0 0 0 0 0 0 25 25 50 0 0 0 0 25 25 25 25 25 ## [37] 25 0 25 0 0 25 25 0 25 25 0 0 0 0 0 25 25 25 ## [55] 25 25 25 25 25 25 25 0 0 0 25 0 0 0 0 0 0 25 ## [73] 0 0 25 0 0 0 0 0 0 0 25 25 0 0 0 0 0 0 ## [91] 0 0 0 0 0 25 25 25 0 0 0 25 25 0 0 0 25 0 ## [109] 0 0 0 0 0 0 25 0 0 0 25 0 0 0 0 0 0 0 ## [127] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ## [145] 0 0 0 0 0 25 0 0 0
Gözlemler açısından incelendiğinde ise kayıp değer oranının örneğin 3. gözlemde (satırda) %25, 5. gözlemde ise %100 olduğu görülmektedir.
md.pattern(havaKalitesi,rotate.names = TRUE)
## Temp Solar.R Wind Ozone ## 105 1 1 1 1 0 ## 34 1 1 1 0 1 ## 2 1 1 0 1 1 ## 1 1 1 0 0 2 ## 3 1 0 1 1 1 ## 1 1 0 1 0 2 ## 1 1 0 0 1 2 ## 2 0 1 1 1 1 ## 2 0 1 0 1 2 ## 1 0 0 0 1 3 ## 1 0 0 0 0 4 ## 6 7 8 37 58
md.pattern(havaKalitesi,rotate.names = TRUE,plot = FALSE)
## Temp Solar.R Wind Ozone ## 105 1 1 1 1 0 ## 34 1 1 1 0 1 ## 2 1 1 0 1 1 ## 1 1 1 0 0 2 ## 3 1 0 1 1 1 ## 1 1 0 1 0 2 ## 1 1 0 0 1 2 ## 2 0 1 1 1 1 ## 2 0 1 0 1 2 ## 1 0 0 0 1 3 ## 1 0 0 0 0 4 ## 6 7 8 37 58
# Missing-value proportion per variable plus the missingness-pattern plot.
# The axis labels are taken from the data set itself: `names(data)` would
# refer to the base function `data` and evaluate to NULL.
aggr_plot <- aggr(
  havaKalitesi,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(havaKalitesi),
  cex.axis = .7,
  gap = 3,
  ylab = c("Histogram", "Pattern")
)
## ## Variables sorted by number of missings: ## Variable Count ## Ozone 0.24183007 ## Wind 0.05228758 ## Solar.R 0.04575163 ## Temp 0.03921569
marginplot(havaKalitesi[c(1,2)])
# Work on a copy of the data.
veriTemp <- veri
# Listwise deletion: show only the rows that contain no NA.
na.omit(veriTemp)
## Kalp Yas Diyabet Tansiyon Sigara Egzersiz ## 1 1 67 0 0 0 0 ## 2 1 75 1 1 0 0 ## 3 1 76 0 0 0 0 ## 4 1 67 1 0 1 0 ## 5 0 65 0 0 0 1 ## 6 1 74 0 0 0 0 ## 7 1 69 0 0 0 1 ## 8 0 54 1 1 0 1 ## 12 1 66 0 0 0 1 ## 16 1 69 0 1 1 0 ## 17 0 42 0 0 0 1 ## 18 1 73 0 0 0 0 ## 19 1 68 0 0 0 0 ## 20 1 70 0 0 0 1
# Example score table with a few missing entries.
cpp <- c(80, 75, 55, 60, 40, 40, 50, 80, 80, 30)
python <- c(90, NA, 50, 50, 50, 40, 60, 90, 80, 20)
ist <- c(60, 60, 60, 40, 30, NA, 70, 70, 60, NA)
mat <- c(50, 60, 70, 30, 20, 30, 60, 30, 50, 40)
notlar1 <- data.frame(cpp, python, ist, mat)
notlar1
## cpp python ist mat ## 1 80 90 60 50 ## 2 75 NA 60 60 ## 3 55 50 60 70 ## 4 60 50 40 30 ## 5 40 50 30 20 ## 6 40 40 NA 30 ## 7 50 60 70 60 ## 8 80 90 70 30 ## 9 80 80 60 50 ## 10 30 20 NA 40
# Work on a copy of the score table.
notTemp <- notlar1
# Listwise deletion: show only the rows that contain no NA.
na.omit(notTemp)
## cpp python ist mat ## 1 80 90 60 50 ## 3 55 50 60 70 ## 4 60 50 40 30 ## 5 40 50 30 20 ## 7 50 60 70 60 ## 8 80 90 70 30 ## 9 80 80 60 50
methods(mice)
## [1] mice.impute.2l.bin mice.impute.2l.lmer mice.impute.2l.norm ## [4] mice.impute.2l.pan mice.impute.2lonly.mean mice.impute.2lonly.norm ## [7] mice.impute.2lonly.pmm mice.impute.cart mice.impute.jomoImpute ## [10] mice.impute.lda mice.impute.logreg mice.impute.logreg.boot ## [13] mice.impute.mean mice.impute.midastouch mice.impute.mnar.logreg ## [16] mice.impute.mnar.norm mice.impute.norm mice.impute.norm.boot ## [19] mice.impute.norm.nob mice.impute.norm.predict mice.impute.panImpute ## [22] mice.impute.passive mice.impute.pmm mice.impute.polr ## [25] mice.impute.polyreg mice.impute.quadratic mice.impute.rf ## [28] mice.impute.ri mice.impute.sample mice.mids ## [31] mice.theme ## see '?methods' for accessing help and source code
Yukarıdaki yöntemlerin açıklamaları için kaynak kitaba bakınız.
# Two-column score table; python has one missing entry (row 2).
cpp <- c(80, 75, 55, 60, 40, 40, 50, 80, 80, 30)
python <- c(90, NA, 50, 50, 50, 40, 60, 90, 80, 20)
notlar2 <- data.frame(cpp, python)
notlar2
## cpp python ## 1 80 90 ## 2 75 NA ## 3 55 50 ## 4 60 50 ## 5 40 50 ## 6 40 40 ## 7 50 60 ## 8 80 90 ## 9 80 80 ## 10 30 20
tempNotlar2 <- mice(notlar2,m=5,maxit=50)
summary(tempNotlar2)
## Class: mids ## Number of multiple imputations: 5 ## Imputation methods: ## cpp python ## "" "pmm" ## PredictorMatrix: ## cpp python ## cpp 0 1 ## python 1 0
tempNotlar2$imp
## $cpp ## [1] 1 2 3 4 5 ## <0 rows> (or 0-length row.names) ## ## $python ## 1 2 3 4 5 ## 2 50 80 50 90 50
complete(tempNotlar2,1)
## cpp python ## 1 80 90 ## 2 75 50 ## 3 55 50 ## 4 60 50 ## 5 40 50 ## 6 40 40 ## 7 50 60 ## 8 80 90 ## 9 80 80 ## 10 30 20
xyplot(tempNotlar2,python ~ cpp,pch=8,cex=1.2)
# Fit the regression on every imputed data set, then pool the estimates.
modelFit1 <- with(tempNotlar2, lm(python ~ cpp))
sonuc <- summary(pool(modelFit1))
sonuc
## term estimate std.error statistic df p.value ## 1 (Intercept) -4.407407 12.6275147 -0.3490321 5.767372 0.739453280 ## 2 cpp 1.081481 0.2205411 4.9037650 4.736446 0.005157829
tempHava <- mice(havaKalitesi,m=5,maxit=50,meth='pmm',seed=500)
summary(tempHava)
## Class: mids ## Number of multiple imputations: 5 ## Imputation methods: ## Ozone Solar.R Wind Temp ## "pmm" "pmm" "pmm" "pmm" ## PredictorMatrix: ## Ozone Solar.R Wind Temp ## Ozone 0 1 1 1 ## Solar.R 1 0 1 1 ## Wind 1 1 0 1 ## Temp 1 1 1 0
tempHava$imp$Ozone
## 1 2 3 4 5 ## 5 46 30 41 115 65 ## 10 13 21 20 24 18 ## 25 12 37 14 14 6 ## 26 11 32 37 4 19 ## 27 23 13 18 20 7 ## 32 49 47 28 35 46 ## 33 23 28 20 52 20 ## 34 18 23 28 8 18 ## 35 37 28 71 40 23 ## 36 108 89 48 47 39 ## 37 24 22 7 44 45 ## 39 96 85 168 50 79 ## 42 50 66 50 73 73 ## 43 61 79 168 50 82 ## 45 39 52 20 7 52 ## 46 40 45 63 20 63 ## 52 59 49 23 39 32 ## 53 44 96 49 64 89 ## 54 31 47 32 23 71 ## 55 40 64 35 71 35 ## 56 52 20 52 36 18 ## 57 44 39 20 28 29 ## 58 7 13 21 30 10 ## 59 18 16 36 16 44 ## 60 13 4 13 19 13 ## 61 39 28 37 46 32 ## 65 30 31 23 23 52 ## 72 45 59 32 63 18 ## 75 115 20 46 16 59 ## 83 28 59 40 37 37 ## 84 46 39 32 18 18 ## 102 50 79 96 110 64 ## 103 20 65 45 20 31 ## 107 27 7 41 41 30 ## 115 24 45 30 44 16 ## 119 80 96 66 82 110 ## 150 13 32 27 13 24
xyplot(tempHava,Ozone ~ Wind+Temp+Solar.R,pch=18,cex=1.2)
densityplot(tempHava)
# Fit the regression on every imputed data set, then pool the estimates.
modelFit1 <- with(tempHava, lm(Ozone ~ Wind + Temp + Solar.R))
summary(pool(modelFit1))
## term estimate std.error statistic df p.value ## 1 (Intercept) -56.8040854 21.86949931 -2.597411 64.75451 1.161445e-02 ## 2 Wind -3.4314836 0.59293824 -5.787253 95.39864 9.086850e-08 ## 3 Temp 1.5505290 0.23950239 6.473961 79.99778 7.129456e-09 ## 4 Solar.R 0.0605242 0.02555605 2.368293 21.09575 2.749509e-02
WeatherPlay
## outlook temperature humidity windy play ## 1 sunny 85 85 false no ## 2 sunny 80 90 true no ## 3 overcast 83 86 false yes ## 4 rainy 70 96 false yes ## 5 rainy 68 80 false yes ## 6 rainy 65 70 true no ## 7 overcast 64 65 true yes ## 8 sunny 72 95 false no ## 9 sunny 69 70 false yes ## 10 rainy 75 80 false yes ## 11 sunny 75 70 true yes ## 12 overcast 72 90 true yes ## 13 overcast 81 75 false yes ## 14 rainy 71 91 true no
sapply(WeatherPlay, class)
## outlook temperature humidity windy play ## "factor" "numeric" "numeric" "factor" "factor"
# Discretise the numeric columns of WeatherPlay into character codes.
# The thresholds are evaluated on the original numeric vectors: assigning a
# code into the column first would turn it into character, and the remaining
# comparisons would then be done as strings (e.g. "100" < "70" is TRUE).
havaOyun <- WeatherPlay
sicaklik <- WeatherPlay$temperature
nem <- WeatherPlay$humidity

# temperature: "Y" for >= 80, "N" for [70, 80), "D" for < 70
havaOyun$temperature <- ifelse(
  sicaklik >= 80, "Y",
  ifelse(sicaklik >= 70, "N", "D")
)
# humidity: "Y" for > 80, "N" otherwise
havaOyun$humidity <- ifelse(nem > 80, "Y", "N")
havaOyun
## outlook temperature humidity windy play ## 1 sunny Y Y false no ## 2 sunny Y Y true no ## 3 overcast Y Y false yes ## 4 rainy N Y false yes ## 5 rainy D N false yes ## 6 rainy D N true no ## 7 overcast D N true yes ## 8 sunny N Y false no ## 9 sunny D N false yes ## 10 rainy N N false yes ## 11 sunny N N true yes ## 12 overcast N Y true yes ## 13 overcast Y N false yes ## 14 rainy N Y true no
havaOyun
## outlook temperature humidity windy play ## 1 sunny Y Y false no ## 2 sunny Y Y true no ## 3 overcast Y Y false yes ## 4 rainy N Y false yes ## 5 rainy D N false yes ## 6 rainy D N true no ## 7 overcast D N true yes ## 8 sunny N Y false no ## 9 sunny D N false yes ## 10 rainy N N false yes ## 11 sunny N N true yes ## 12 overcast N Y true yes ## 13 overcast Y N false yes ## 14 rainy N Y true no
# Raw scores used in the scaling examples.
R <- c(50, 75, 75, 60, 40, 90, 50, 80, 80, 30)
python <- c(90, 80, 50, 50, 50, 40, 60, 90, 80, 20)
data.frame(R, python)
## R python ## 1 50 90 ## 2 75 80 ## 3 75 50 ## 4 60 50 ## 5 40 50 ## 6 90 40 ## 7 50 60 ## 8 80 90 ## 9 80 80 ## 10 30 20
# Min-max normalisation: rescale each score vector to the [0, 1] interval.
R01 <- (R - min(R)) / (max(R) - min(R))
python01 <- (python - min(python)) / (max(python) - min(python))
data.frame(R01, python01)
## R01 python01 ## 1 0.3333333 1.0000000 ## 2 0.7500000 0.8571429 ## 3 0.7500000 0.4285714 ## 4 0.5000000 0.4285714 ## 5 0.1666667 0.4285714 ## 6 1.0000000 0.2857143 ## 7 0.3333333 0.5714286 ## 8 0.8333333 1.0000000 ## 9 0.8333333 0.8571429 ## 10 0.0000000 0.0000000
# Min-max normalisation stretched to the [0, 4] interval.
R04 <- 4 * (R - min(R)) / (max(R) - min(R))
python04 <- 4 * (python - min(python)) / (max(python) - min(python))
data.frame(R04, python04)
## R04 python04 ## 1 1.3333333 4.000000 ## 2 3.0000000 3.428571 ## 3 3.0000000 1.714286 ## 4 2.0000000 1.714286 ## 5 0.6666667 1.714286 ## 6 4.0000000 1.142857 ## 7 1.3333333 2.285714 ## 8 3.3333333 4.000000 ## 9 3.3333333 3.428571 ## 10 0.0000000 0.000000
# Z-score standardisation: subtract the mean, divide by the standard deviation.
RZ <- (R - mean(R)) / sd(R)
pythonZ <- (python - mean(python)) / sd(python)
data.frame(RZ, pythonZ)
## RZ pythonZ ## 1 -0.6536415 1.24412731 ## 2 0.6033614 0.81511789 ## 3 0.6033614 -0.47191036 ## 4 -0.1508403 -0.47191036 ## 5 -1.1564426 -0.47191036 ## 6 1.3575631 -0.90091978 ## 7 -0.6536415 -0.04290094 ## 8 0.8547619 1.24412731 ## 9 0.8547619 0.81511789 ## 10 -1.6592438 -1.75893861
ogrenci <- data.frame(Cinsiyet = c("E","K","E","K","K"),Ders = c("C","R","R","R","C"))
ogrenci
## Cinsiyet Ders ## 1 E C ## 2 K R ## 3 E R ## 4 K R ## 5 K C
dummy_cols(ogrenci)
## Cinsiyet Ders Cinsiyet_E Cinsiyet_K Ders_C Ders_R ## 1 E C 1 0 1 0 ## 2 K R 0 1 0 1 ## 3 E R 1 0 0 1 ## 4 K R 0 1 0 1 ## 5 K C 0 1 1 0
dummy_cols(ogrenci,select_columns = c("Cinsiyet"))
## Cinsiyet Ders Cinsiyet_E Cinsiyet_K ## 1 E C 1 0 ## 2 K R 0 1 ## 3 E R 1 0 ## 4 K R 0 1 ## 5 K C 0 1
set.seed(1234)
# Randomly draw 80% of the row indices (rounded up) for the training set.
egitimGozlemNo <- sort(
  sample(seq_len(nrow(havaOyun)), ceiling(0.80 * nrow(havaOyun)))
)
egitimGozlemNo
## [1] 2 3 4 5 6 8 9 10 11 12 13 14
# Selected rows form the training set; all remaining rows form the test set.
havaEgitim <- havaOyun[egitimGozlemNo, ]
havaTest <- havaOyun[-egitimGozlemNo, ]
havaEgitim
## outlook temperature humidity windy play ## 2 sunny Y Y true no ## 3 overcast Y Y false yes ## 4 rainy N Y false yes ## 5 rainy D N false yes ## 6 rainy D N true no ## 8 sunny N Y false no ## 9 sunny D N false yes ## 10 rainy N N false yes ## 11 sunny N N true yes ## 12 overcast N Y true yes ## 13 overcast Y N false yes ## 14 rainy N Y true no
havaTest
## outlook temperature humidity windy play ## 1 sunny Y Y false no ## 7 overcast D N true yes
set.seed(1234)
# Draw the training rows with caret, partitioning on the outcome `play`
# (p = 0.7 of the data, a single resample returned as a matrix).
egitimGozlemNo <- createDataPartition(
  havaOyun$play,
  p = 0.7,
  list = FALSE,
  times = 1
)
egitimGozlemNo
## Resample1 ## [1,] 2 ## [2,] 3 ## [3,] 4 ## [4,] 6 ## [5,] 7 ## [6,] 8 ## [7,] 9 ## [8,] 10 ## [9,] 11 ## [10,] 12 ## [11,] 14
# Selected rows form the training set; all remaining rows form the test set.
havaEgitim <- havaOyun[egitimGozlemNo, ]
havaTest <- havaOyun[-egitimGozlemNo, ]
havaEgitim
## outlook temperature humidity windy play ## 2 sunny Y Y true no ## 3 overcast Y Y false yes ## 4 rainy N Y false yes ## 6 rainy D N true no ## 7 overcast D N true yes ## 8 sunny N Y false no ## 9 sunny D N false yes ## 10 rainy N N false yes ## 11 sunny N N true yes ## 12 overcast N Y true yes ## 14 rainy N Y true no
havaTest
## outlook temperature humidity windy play ## 1 sunny Y Y false no ## 5 rainy D N false yes ## 13 overcast Y N false yes
zaman <- data.frame(2000:2020)
# Time slices with a training window of 5 observations followed by a
# test horizon of 2 observations.
dilimler <- createTimeSlices(
  zaman$X2000.2020,
  initialWindow = 5,
  horizon = 2,
  fixedWindow = TRUE
)
# Example: pick one particular slice for the training and test steps.
zaman[dilimler$train$Training09, ]
## [1] 2004 2005 2006 2007 2008
zaman[dilimler$test$Testing09,]
## [1] 2009 2010