# Install 'faraway' only when it is not already available.
# requireNamespace() tests for the package without attaching it; attaching is
# left to the library(faraway) call below.
if (!requireNamespace("faraway", quietly = TRUE)) {
  install.packages("faraway")
}
library(faraway)
library(dplyr)
library(ggplot2)
library(corrplot)
library(MASS)
# Load the "pima" data set and display its structure.
data("pima")
str(pima)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant : int 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ diastolic: int 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : int 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ bmi : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ diabetes : num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ test : int 1 0 1 0 1 0 1 0 1 1 ...
# Show the first six rows of the data set.
head(pima, n = 6L)
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 1 6 148 72 35 0 33.6 0.627 50 1
## 2 1 85 66 29 0 26.6 0.351 31 0
## 3 8 183 64 0 0 23.3 0.672 32 1
## 4 1 89 66 23 94 28.1 0.167 21 0
## 5 0 137 40 35 168 43.1 2.288 33 1
## 6 5 116 74 0 0 25.6 0.201 30 0
# Show the last six rows of the data set.
tail(pima, n = 6L)
## pregnant glucose diastolic triceps insulin bmi diabetes age test
## 763 9 89 62 0 0 22.5 0.142 33 0
## 764 10 101 76 48 180 32.9 0.171 63 0
## 765 2 122 70 27 0 36.8 0.340 27 0
## 766 5 121 72 23 112 26.2 0.245 30 0
## 767 1 126 60 0 0 30.1 0.349 47 1
## 768 1 93 70 31 0 30.4 0.315 23 0
# Use the summary to look for irregularities and to get a general description
# of the data.
summary(object = pima)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## insulin bmi diabetes age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## test
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
Kolona test uzima vrednosti 0 ili 1 u zavisnosti od toga da li data osoba pati od dijabetesa ili ne. Ovakve promenljive su kategoricke - u R-u se predstavljaju kao faktori. Kao sto se vidi iz izlaza funkcije str, kolona test u ovom slucaju
nije tretirana kao kategoricka (ucitana je kao int), a trebalo bi. Zato sami pravimo “faktor”:
# Recode the test column as a two-level factor, then re-check the structure.
pima[["test"]] <- factor(pima[["test"]], labels = c("negative", "positive"))
str(object = pima)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant : int 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ diastolic: int 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : int 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ bmi : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ diabetes : num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ test : Factor w/ 2 levels "negative","positive": 2 1 2 1 2 1 2 1 2 2 ...
# Count the observations in each level of the test factor.
summary(pima[["test"]])
## negative positive
## 500 268
Nastavljamo sa daljom analizom seta:
# The diastolic column holds blood pressure, which should never be zero, so a
# zero here means the data were entered incorrectly.
head(pima[["diastolic"]], n = 100L)
## [1] 72 66 64 66 40 74 50 0 70 96 92 74 80 60 72 0 84
## [18] 74 30 70 88 84 90 80 94 70 76 66 82 92 75 76 58 92
## [35] 78 60 76 76 68 72 64 84 92 110 64 66 56 70 66 0 80
## [52] 50 66 90 66 50 68 88 82 64 0 72 62 58 66 74 88 92
## [69] 66 85 66 64 90 86 75 48 78 72 0 66 44 0 78 65 108
## [86] 74 72 68 70 68 55 80 78 72 82 72 62 48 50 90
# Sorting in ascending order brings the smallest values (the zeros) to the front.
head(sort(pima[["diastolic"]]), n = 100L)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [24] 0 0 0 0 0 0 0 0 0 0 0 0 24 30 30 38 40 44 44 44 44 46 46
## [47] 48 48 48 48 48 50 50 50 50 50 50 50 50 50 50 50 50 50 52 52 52 52 52
## [70] 52 52 52 52 52 52 54 54 54 54 54 54 54 54 54 54 54 55 55 56 56 56 56
## [93] 56 56 56 56 56 56 56 56
# Likewise, the columns glucose, insulin, bmi, ... should never be zero, so
# every zero in them is recoded as NA.
# The affected columns are listed once and recoded by a single expression
# instead of one hand-copied statement per column.
zero_means_missing <- c("diastolic", "glucose", "triceps", "insulin", "bmi")
pima[zero_means_missing] <- lapply(
  pima[zero_means_missing],
  # which() ignores NA comparisons, so re-running on already recoded data is safe.
  function(values) replace(values, which(values == 0), NA)
)
# Check what the data set looks like now.
summary(object = pima)
## pregnant glucose diastolic triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## insulin bmi diabetes age
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
## test
## negative:500
## positive:268
##
##
##
##
##
# Exploratory plots of the diastolic column.
hist(pima$diastolic)
truehist(pima$diastolic) # normalised histogram
plot(density(pima$diastolic, na.rm = TRUE))
plot(sort(pima$diastolic), pch = ".")
plot(diabetes ~ diastolic, data = pima)
# When the variable on the right-hand side of the formula is categorical,
# plot() produces box plots (plot is a generic function whose behaviour
# depends on context).
plot(diabetes ~ test, data = pima)
# Scatter-plot matrix of all columns.
pairs(pima)
# Keep only the numeric columns. Base column subsetting is used in place of
# dplyr::select_if(), which dplyr documents as superseded.
numericalVars <- pima[vapply(pima, is.numeric, logical(1))]
# Compute the correlation matrix of the numeric columns; use = "complete.obs"
# means only rows (observations) without any NA are used.
corrMat <- cor(numericalVars, use = "complete.obs")
# Draw the correlation ellipse plot.
corrplot(corrMat, method = "ellipse")