Data Cleaning Final Project

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. The dataset consists of 768 observations and 9 variables and is available in the R package mlbench as PimaIndiansDiabetes. We will use diabetes as our response/target variable.

The 9 variables are described as follows.

pregnant - Number of times pregnant.
glucose - Plasma glucose concentration (glucose tolerance test)
pressure - Diastolic blood pressure (mm Hg)
triceps - Triceps skin fold thickness (mm)
insulin - 2-Hour serum insulin (mu U/ml)
mass - Body mass index (weight in kg/(height in m)^2)
pedigree - Diabetes pedigree function
age - Age (years)
diabetes - Class variable (test for diabetes)

More information on the dataset can be read by accessing the website: http://math.furman.edu/~dcs/courses/math47/R/library/mlbench/html/PimaIndiansDiabetes.html

# Load the Pima Indians Diabetes data. Naming the package in data() lets the
# call succeed even when mlbench has not been attached with library().
data("PimaIndiansDiabetes", package = "mlbench")
df <- PimaIndiansDiabetes
# dimensions of the data set (rows, columns)
dim(df)
## [1] 768   9
# column names and column types
str(df)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
# Summary statistics for each column (numeric summaries for the measurements,
# level counts for the diabetes factor).
# NOTE(review): the output below reports a minimum of 0 for glucose, pressure,
# triceps, insulin and mass; these zeros presumably encode missing
# measurements -- confirm before treating them as real values.
summary(df)
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

Correlations

To compute correlations, the diabetes column must first be converted to numeric (1 = neg, 2 = pos). Glucose level has the strongest correlation with diabetes (r ≈ 0.47), which is a moderate positive association; mass (r ≈ 0.29) and age (r ≈ 0.24) follow.

# Numeric copy of the data: the diabetes factor is replaced by its integer
# codes stored as doubles (neg -> 1, pos -> 2); all other columns are kept.
dfNum <- transform(df, diabetes = as.numeric(diabetes))
# pairwise correlation matrix of all nine columns
cor(dfNum)
##             pregnant    glucose   pressure     triceps     insulin       mass
## pregnant  1.00000000 0.12945867 0.14128198 -0.08167177 -0.07353461 0.01768309
## glucose   0.12945867 1.00000000 0.15258959  0.05732789  0.33135711 0.22107107
## pressure  0.14128198 0.15258959 1.00000000  0.20737054  0.08893338 0.28180529
## triceps  -0.08167177 0.05732789 0.20737054  1.00000000  0.43678257 0.39257320
## insulin  -0.07353461 0.33135711 0.08893338  0.43678257  1.00000000 0.19785906
## mass      0.01768309 0.22107107 0.28180529  0.39257320  0.19785906 1.00000000
## pedigree -0.03352267 0.13733730 0.04126495  0.18392757  0.18507093 0.14064695
## age       0.54434123 0.26351432 0.23952795 -0.11397026 -0.04216295 0.03624187
## diabetes  0.22189815 0.46658140 0.06506836  0.07475223  0.13054795 0.29269466
##             pedigree         age   diabetes
## pregnant -0.03352267  0.54434123 0.22189815
## glucose   0.13733730  0.26351432 0.46658140
## pressure  0.04126495  0.23952795 0.06506836
## triceps   0.18392757 -0.11397026 0.07475223
## insulin   0.18507093 -0.04216295 0.13054795
## mass      0.14064695  0.03624187 0.29269466
## pedigree  1.00000000  0.03356131 0.17384407
## age       0.03356131  1.00000000 0.23835598
## diabetes  0.17384407  0.23835598 1.00000000
# Draw the lower triangle of the correlation matrix, printing each
# coefficient as a number.
corrplot::corrplot(
  cor(dfNum),
  method = "number",
  type = "lower"
)

Factor the columns and run apriori

Categories from the US CDC:

BMI (body mass index) less than 18.5: underweight range
BMI 18.5 to < 25: healthy weight range
BMI 25.0 to < 30: overweight range
BMI 30.0 or higher: obesity range

# Discretise every predictor into a factor so the data frame can later be
# coerced to transactions. diabetes is already a factor and is left as is.
dfFactorized <- df

# Each distinct pregnancy count becomes its own level.
dfFactorized$pregnant <- factor(dfFactorized$pregnant)

# Equal-width bins spanning each column's observed range.
dfFactorized$glucose <- cut(
  dfFactorized$glucose,
  breaks = 3, labels = c("low", "med", "high")
)
dfFactorized$pressure <- cut(
  dfFactorized$pressure,
  breaks = 3, labels = c("low", "med", "high")
)
dfFactorized$triceps <- cut(
  dfFactorized$triceps,
  breaks = 5, labels = c("very low", "low", "med", "high", "very high")
)
dfFactorized$insulin <- cut(
  dfFactorized$insulin,
  breaks = 3, labels = c("low", "med", "high")
)

# CDC BMI ranges are closed on the left: < 18.5 underweight, [18.5, 25)
# healthy, [25, 30) overweight, >= 30 obese. cut() defaults to right-closed
# intervals, which would put a BMI of exactly 18.5, 25 or 30 one category too
# low, so right = FALSE is required. Infinite outer breaks avoid NA for any
# value outside an arbitrary finite bound.
# NOTE: rows sitting exactly on a boundary change category relative to
# right-closed binning, so previously printed counts should be regenerated.
dfFactorized$mass <- cut(
  dfFactorized$mass,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("underweight", "healthy", "overweight", "obese"),
  right = FALSE
)

dfFactorized$pedigree <- cut(
  dfFactorized$pedigree,
  breaks = 3, labels = c("low", "med", "high")
)

# Age bands (right-closed); levels keep cut()'s default interval labels.
dfFactorized$age <- cut(dfFactorized$age, breaks = c(0, 20, 30, 40, 50, 60, 100))

summary(dfFactorized)
##     pregnant   glucose    pressure        triceps    insulin   
##  1      :135   low : 12   low : 40   very low :338   low :724  
##  0      :111   med :515   med :563   low      :340   med : 39  
##  2      :103   high:241   high:165   med      : 87   high:  5  
##  3      : 75                         high     :  2             
##  4      : 68                         very high:  1             
##  5      : 57                                                   
##  (Other):219                                                   
##           mass     pedigree         age      diabetes 
##  underweight: 15   low :685   (0,20]  :  0   neg:500  
##  healthy    :108   med : 74   (20,30] :417   pos:268  
##  overweight :180   high:  9   (30,40] :157            
##  obese      :465              (40,50] :113            
##                               (50,60] : 54            
##                               (60,100]: 27            
## 

Run Apriori

Now that all columns are factors, we convert the data frame to the transactions data type.

# Coerce the all-factor data frame into a transactions object
# (one transaction per row).
transactions <- as(object = dfFactorized, Class = "transactions")
# overview of the resulting item matrix
summary(transactions)
## transactions as itemMatrix in sparse format with
##  768 rows (elements/itemsets/transactions) and
##  46 columns (items) and a density of 0.1956522 
## 
## most frequent items:
##  insulin=low pedigree=low pressure=med  glucose=med diabetes=neg      (Other) 
##          724          685          563          515          500         3925 
## 
## element (itemset/transaction) length distribution:
## sizes
##   9 
## 768 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       9       9       9       9       9       9 
## 
## includes extended item information - examples:
##       labels variables levels
## 1 pregnant=0  pregnant      0
## 2 pregnant=1  pregnant      1
## 3 pregnant=2  pregnant      2
## 
## includes extended transaction information - examples:
##   transactionID
## 1             1
## 2             2
## 3             3
# Item frequency bar chart, limited by a support threshold of 0.2 and a
# top-N cut-off of 12 items.
itemFrequencyPlot(
  transactions,
  topN = 12,
  support = 0.2
)

Apply Apriori Algorithm

Use the apriori algorithm to discover patterns or rules.

# Mine association rules, keeping those with support >= 0.1 and
# confidence >= 0.5.
ars <- apriori(
  data = transactions,
  parameter = list(confidence = 0.5, support = 0.1)
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 76 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[45 item(s), 768 transaction(s)] done [0.00s].
## sorting and recoding items ... [20 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.00s].
## writing ... [1639 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Plot the mined rule set.
# NOTE(review): the plot method for rules objects presumably comes from
# arulesViz, which would need to be attached -- confirm.
plot(ars)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

Explore the Rule Set

Query the resulting rule set (ars) to find the “interesting” rules.

# Keep only the rules whose consequent is a positive diabetes test, so the
# left-hand sides can be read as candidate risk profiles.
ruleSubset <- subset(ars, subset = rhs %in% "diabetes=pos")
ruleSubset
## set of 12 rules
# Show the ten rules with the highest lift.
inspect(
  head(ruleSubset, n = 10, by = "lift")
)
##      lhs                rhs              support confidence  coverage     lift count
## [1]  {glucose=high,                                                                 
##       insulin=low,                                                                  
##       mass=obese}    => {diabetes=pos} 0.1406250  0.7552448 0.1861979 2.164283   108
## [2]  {glucose=high,                                                                 
##       insulin=low,                                                                  
##       mass=obese,                                                                   
##       pedigree=low}  => {diabetes=pos} 0.1197917  0.7479675 0.1601563 2.143429    92
## [3]  {glucose=high,                                                                 
##       pressure=med,                                                                 
##       mass=obese}    => {diabetes=pos} 0.1002604  0.7475728 0.1341146 2.142298    77
## [4]  {glucose=high,                                                                 
##       mass=obese,                                                                   
##       pedigree=low}  => {diabetes=pos} 0.1445312  0.7350993 0.1966146 2.106553   111
## [5]  {glucose=high,                                                                 
##       mass=obese}    => {diabetes=pos} 0.1692708  0.7344633 0.2304688 2.104731   130
## [6]  {glucose=high,                                                                 
##       insulin=low}   => {diabetes=pos} 0.1705729  0.6421569 0.2656250 1.840211   131
## [7]  {glucose=high}  => {diabetes=pos} 0.2005208  0.6390041 0.3138021 1.831176   154
## [8]  {glucose=high,                                                                 
##       pedigree=low}  => {diabetes=pos} 0.1731771  0.6273585 0.2760417 1.797803   133
## [9]  {glucose=high,                                                                 
##       pressure=med,                                                                 
##       insulin=low}   => {diabetes=pos} 0.1028646  0.6269841 0.1640625 1.796731    79
## [10] {glucose=high,                                                                 
##       pressure=med}  => {diabetes=pos} 0.1223958  0.6266667 0.1953125 1.795821    94