# Week 1 Demonstration R Code for PREDICT 401 
# This program will demonstrate various data displays using RStudio and base R.
#-------------------------------------------------------------------------------

tumor <- read.csv(file.path("~/Desktop/MSDS401/", "tumor.csv"), sep=",")

# The tumor data are from 205 patients in Denmark with malignant melanoma. 
# The data frame has 205 rows and 5 variables. The variables are:

# status    "died" died from melanoma, "alive" alive, "other" died other causes
# sex       "male", "female"
# age       age in years
# thickness tumour thickness in mm
# ulcer     "present", "absent"

#-------------------------------------------------------------------------------
# Check the structure of the data set.  Calculate some simple statistics.

str(tumor)
## 'data.frame':    205 obs. of  5 variables:
##  $ status   : chr  "other" "other" "alive" "other" ...
##  $ sex      : chr  "male" "male" "male" "female" ...
##  $ age      : int  76 56 41 71 52 28 77 60 49 68 ...
##  $ thickness: num  6.76 0.65 1.34 2.9 12.08 ...
##  $ ulcer    : chr  "present" "absent" "absent" "absent" ...
head(tumor, n = 5L)
##   status    sex age thickness   ulcer
## 1  other   male  76      6.76 present
## 2  other   male  56      0.65  absent
## 3  alive   male  41      1.34  absent
## 4  other female  71      2.90  absent
## 5   died   male  52     12.08 present
summary(tumor)
##     status              sex                 age          thickness    
##  Length:205         Length:205         Min.   : 4.00   Min.   : 0.10  
##  Class :character   Class :character   1st Qu.:42.00   1st Qu.: 0.97  
##  Mode  :character   Mode  :character   Median :54.00   Median : 1.94  
##                                        Mean   :52.46   Mean   : 2.92  
##                                        3rd Qu.:65.00   3rd Qu.: 3.56  
##                                        Max.   :95.00   Max.   :17.42  
##     ulcer          
##  Length:205        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Qualitative Data Displays of Individual Variables
#-------------------------------------------------------------------------------
# A pie chart is a circular display of data where the area represents 100% of the
# data, and each slice the corresponding percent breakdown.  Form a pie chart
# showing the breakdown of patient status.

patient.status <- table(tumor$status)  # This is a simple frequency table.
patient.status
## 
## alive  died other 
##   134    57    14
pie(patient.status)

# Plot a pie chart based on age classification.  This demonstrates how a ratio
# variable can be converted into an ordinal variable using age classes.

classes <- c(0,20,40,60,80,100)  # This defines the age class boundaries.
result <- cut(tumor$age, breaks = classes, right = FALSE)
result
##   [1] [60,80)  [40,60)  [40,60)  [60,80)  [40,60)  [20,40)  [60,80)  [60,80) 
##   [9] [40,60)  [60,80)  [40,60)  [60,80)  [60,80)  [60,80)  [0,20)   [60,80) 
##  [17] [40,60)  [60,80)  [80,100) [40,60)  [80,100) [20,40)  [20,40)  [40,60) 
##  [25] [60,80)  [60,80)  [80,100) [40,60)  [0,20)   [40,60)  [60,80)  [40,60) 
##  [33] [40,60)  [60,80)  [60,80)  [60,80)  [0,20)   [60,80)  [40,60)  [40,60) 
##  [41] [40,60)  [60,80)  [60,80)  [60,80)  [20,40)  [60,80)  [40,60)  [60,80) 
##  [49] [60,80)  [60,80)  [80,100) [60,80)  [60,80)  [60,80)  [20,40)  [40,60) 
##  [57] [40,60)  [20,40)  [20,40)  [40,60)  [40,60)  [60,80)  [40,60)  [40,60) 
##  [65] [40,60)  [40,60)  [20,40)  [60,80)  [40,60)  [40,60)  [20,40)  [80,100)
##  [73] [40,60)  [0,20)   [40,60)  [40,60)  [40,60)  [40,60)  [20,40)  [40,60) 
##  [81] [40,60)  [40,60)  [40,60)  [60,80)  [40,60)  [40,60)  [60,80)  [40,60) 
##  [89] [60,80)  [40,60)  [40,60)  [60,80)  [80,100) [60,80)  [40,60)  [60,80) 
##  [97] [20,40)  [40,60)  [20,40)  [60,80)  [60,80)  [80,100) [60,80)  [40,60) 
## [105] [60,80)  [20,40)  [40,60)  [40,60)  [20,40)  [60,80)  [60,80)  [40,60) 
## [113] [40,60)  [60,80)  [20,40)  [40,60)  [60,80)  [40,60)  [40,60)  [20,40) 
## [121] [40,60)  [60,80)  [60,80)  [40,60)  [60,80)  [40,60)  [60,80)  [60,80) 
## [129] [60,80)  [40,60)  [40,60)  [60,80)  [60,80)  [60,80)  [40,60)  [40,60) 
## [137] [40,60)  [40,60)  [60,80)  [40,60)  [60,80)  [40,60)  [20,40)  [40,60) 
## [145] [20,40)  [40,60)  [20,40)  [40,60)  [60,80)  [60,80)  [60,80)  [20,40) 
## [153] [60,80)  [40,60)  [20,40)  [60,80)  [40,60)  [60,80)  [20,40)  [40,60) 
## [161] [20,40)  [40,60)  [60,80)  [20,40)  [40,60)  [20,40)  [40,60)  [60,80) 
## [169] [20,40)  [60,80)  [60,80)  [40,60)  [60,80)  [0,20)   [60,80)  [40,60) 
## [177] [20,40)  [40,60)  [20,40)  [40,60)  [60,80)  [40,60)  [40,60)  [20,40) 
## [185] [40,60)  [0,20)   [20,40)  [40,60)  [40,60)  [20,40)  [40,60)  [40,60) 
## [193] [60,80)  [40,60)  [40,60)  [20,40)  [20,40)  [40,60)  [20,40)  [0,20)  
## [201] [20,40)  [40,60)  [40,60)  [40,60)  [40,60) 
## Levels: [0,20) [20,40) [40,60) [60,80) [80,100)
# Note that age class is an ordinal variable.  Each observation is identified by
# its respective age class.  The age classes have an implicit order: 
# [0, 20), [20, 40), [40, 60), and so forth.

count <- table(result)
pie(count)

# The labels and colors can be changed and a title added.

lbls <- c("Class 1","Class 2","Class 3","Class 4","Class 5")
colors = c("red", "yellow", "green", "violet", "orange") 
pie(count, labels = lbls, col = colors, main = "Pie Chart of Age Distribution")

#-------------------------------------------------------------------------------
# Bar graphs are used to display qualitative data with the catgories on one axis,
# and the magnitude of some quantity, like frequency, forming the length of a bar.

# Form vertical and horizontal bar plots showing age distribution frequencies.

barplot(count) 

# The generic display can be enhanced.

colors = c("red", "yellow", "green", "violet", "orange") 
barplot(count, col=colors, main = "Bar Plot of Age Distribution",
        names.arg = c("[0,20)","[20,40)","[40,60)","[60,80)","[80,100)"),
        xlab = "Age Class", ylab = "Frequency")

barplot(count, col=colors, main = "Bar Plot of Age Distribution",
        names.arg = c("Class 1","Class 2","Class 3","Class 4","Class 5"),
        xlab = "Age Class", ylab = "Frequency", horiz = TRUE)

#-------------------------------------------------------------------------------
# Form stacked and adjacent bar plots with colors and legend.

# Bar plots are useful when dealing with nominal and ordinal variables.

counts <- table(tumor$sex, tumor$status)
barplot(counts, main="Patient Distribution by Status and Sex", ylab = "Frequency",
        xlab="Status", col=c("darkblue","red"),legend = rownames(counts))

# result <- cut(tumor$age, breaks = classes, right = FALSE) was defined earlier.

counts <- table(result, tumor$sex)
counts <- t(counts)  #This organizes the data for vertical stacking.
counts
##         result
##          [0,20) [20,40) [40,60) [60,80) [80,100)
##   female      4      23      56      41        2
##   male        3      12      32      27        5
barplot(counts, main="Patient Distribution by Age and Sex", ylab = "Frequency",
        xlab="Age Class", col=c("darkblue","red"),legend = rownames(counts))

barplot(counts, main="Patient Distribution by Age and Sex", ylab = "Frequency",
        xlab="Age Class", col=c("darkblue","red"),legend = rownames(counts),
        names.arg = c("Class 1","Class 2","Class 3","Class 4","Class 5"), beside = TRUE)   

#--------------------------------------------------------------------------------
# A pareto chart is a particular application of a bar graph in which the categories
# are ranked in order of occurrence. The previous bar plot turns out to be similar 
# to a pareto chart.  To generate a pareto chart, the package "qcc" needs to be 
# installed and called so that the function "pareto.chart()" can be used.

library(qcc)
## Package 'qcc' version 2.7
## Type 'citation("qcc")' for citing this R package in publications.
patient.status <- table(tumor$status)
pareto.chart(patient.status)

##        
## Pareto chart analysis for patient.status
##          Frequency  Cum.Freq. Percentage Cum.Percent.
##   alive 134.000000 134.000000  65.365854    65.365854
##   died   57.000000 191.000000  27.804878    93.170732
##   other  14.000000 205.000000   6.829268   100.000000
age.distribution <- count
pareto.chart(age.distribution)

##           
## Pareto chart analysis for age.distribution
##             Frequency  Cum.Freq. Percentage Cum.Percent.
##   [40,60)   88.000000  88.000000  42.926829    42.926829
##   [60,80)   68.000000 156.000000  33.170732    76.097561
##   [20,40)   35.000000 191.000000  17.073171    93.170732
##   [0,20)     7.000000 198.000000   3.414634    96.585366
##   [80,100)   7.000000 205.000000   3.414634   100.000000
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Quantitative data displays of individual variables
#-------------------------------------------------------------------------------
# Histograms show the frequency of data in given class intervals.  The shape of 
# data distribution is revealed for an initial overview.

# Form histograms showing age distribution.  Use the same class boundaries and 
# colors.  Here age is treated as a ratio variable without prior classification.

hist(tumor$age)

hist(tumor$age, breaks = classes, main = "Histogram Showing Age Distribution",
     xlab = "Age Class", col = colors)        

# A frequency polygon is similar to a histogram displaying class frequencies
# versus class midpoint.  Form a display using the vector "count" defined earlier.

center <- c(10, 30, 50, 70, 90)  # Class interval midpoints
plot(center, count)

plot(center, count, type = "b", col = "red", main = "Frequency Polygon",
     xlab = "Age Class Midpoints", ylab = "Frequency", xlim = c(0, 100))

# An ogive is a cumulative frequency polygon.  Steep slopes identify where sharp
# increases in frequencies occur.  Construct an ogive of age distribution defined 
# earlier.  (The following calculation could be done using a "for" loop.)  

cum.count <- numeric(0)   # This defines cum.count as a numeric vector.

cum.count[1] <- count[1]
cum.count[2] <- cum.count[1]+ count[2]
cum.count[3] <- cum.count[2] + count[3]
cum.count[4] <- cum.count[3] + count[4]
cum.count[5] <- cum.count[4] + count[5]
cum.count
## [1]   7  42 130 198 205
n <- cum.count[5]

plot(center, cum.count, type = "b", col = "red", main = "Ogive of Age Distribution",
     xlab = "Age Class Midpoints", ylab = "Cumulative Frequency", xlim = c(0, 100))

plot(center, cum.count/n, type = "b", col = "red", xlab = "Age Class Midpoints", 
     main = "Ogive of Cumulative Relative Frequencies", xlim = c(0, 100), 
     ylab = "Cumulative Relative Frequency")

#-------------------------------------------------------------------------------
# A stem-and-leaf plot is sometimes used to give a unique view of the data.  The
# left most digits of a variable form the stem, and the right most form the leaf.
# R will round off the leaf entries to single digits.

sort(tumor$thickness, decreasing = FALSE)
##   [1]  0.10  0.16  0.16  0.16  0.16  0.16  0.16  0.16  0.24  0.32  0.32  0.32
##  [13]  0.32  0.32  0.32  0.48  0.48  0.48  0.48  0.58  0.64  0.64  0.64  0.64
##  [25]  0.65  0.65  0.65  0.65  0.65  0.65  0.65  0.65  0.65  0.65  0.81  0.81
##  [37]  0.81  0.81  0.81  0.81  0.81  0.81  0.81  0.81  0.81  0.97  0.97  0.97
##  [49]  0.97  0.97  0.97  0.97  0.97  0.97  0.97  0.97  1.03  1.13  1.13  1.13
##  [61]  1.13  1.29  1.29  1.29  1.29  1.29  1.29  1.29  1.29  1.29  1.29  1.29
##  [73]  1.29  1.29  1.29  1.29  1.29  1.34  1.34  1.37  1.45  1.45  1.45  1.53
##  [85]  1.62  1.62  1.62  1.62  1.62  1.62  1.62  1.62  1.62  1.62  1.62  1.62
##  [97]  1.76  1.78  1.78  1.94  1.94  1.94  1.94  1.94  1.94  1.94  1.94  1.94
## [109]  1.94  2.10  2.10  2.10  2.24  2.26  2.26  2.26  2.26  2.26  2.34  2.42
## [121]  2.58  2.58  2.58  2.58  2.58  2.58  2.58  2.58  2.58  2.74  2.90  2.90
## [133]  2.90  3.06  3.06  3.22  3.22  3.22  3.22  3.22  3.22  3.22  3.22  3.22
## [145]  3.22  3.54  3.54  3.54  3.54  3.54  3.54  3.54  3.54  3.56  3.87  3.87
## [157]  3.87  3.87  3.87  3.87  4.04  4.09  4.19  4.19  4.51  4.82  4.83  4.83
## [169]  4.84  4.84  4.84  4.84  4.84  5.16  5.16  5.16  5.48  5.48  5.64  5.80
## [181]  5.80  6.12  6.12  6.44  6.76  7.06  7.06  7.09  7.09  7.41  7.73  7.73
## [193]  7.89  8.06  8.38  8.54  9.66 12.08 12.24 12.56 12.88 12.88 13.85 14.66
## [205] 17.42
# Note how the stem-and-leaf plot rounds the original data prior to plotting.

stem(tumor$thickness)
## 
##   The decimal point is at the |
## 
##    0 | 122222222333333555566666777777777788888888888
##    1 | 0000000000001111333333333333333333455556666666666668889999999999
##    2 | 111233333346666666667999
##    3 | 112222222222555555556999999
##    4 | 0122588888888
##    5 | 22255688
##    6 | 1148
##    7 | 11114779
##    8 | 145
##    9 | 7
##   10 | 
##   11 | 
##   12 | 12699
##   13 | 9
##   14 | 7
##   15 | 
##   16 | 
##   17 | 4
# A scale parameter can be used with this display.

stem(tumor$thickness, scale = 1)
## 
##   The decimal point is at the |
## 
##    0 | 122222222333333555566666777777777788888888888
##    1 | 0000000000001111333333333333333333455556666666666668889999999999
##    2 | 111233333346666666667999
##    3 | 112222222222555555556999999
##    4 | 0122588888888
##    5 | 22255688
##    6 | 1148
##    7 | 11114779
##    8 | 145
##    9 | 7
##   10 | 
##   11 | 
##   12 | 12699
##   13 | 9
##   14 | 7
##   15 | 
##   16 | 
##   17 | 4
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Look at two variables simultaneously.
#-------------------------------------------------------------------------------
# Calculate various summary statistics such as the mean, standard deviation, etc.
# R provides a variety of functions which can be used with tables and lists.

tapply(tumor$thickness, list(tumor$sex, tumor$ulcer), FUN = mean)
##          absent  present
## female 1.737342 3.745532
## male   1.973611 4.982093
tapply(tumor$thickness, list(tumor$sex, tumor$ulcer), FUN = sd)
##          absent  present
## female 1.904453 3.447580
## male   2.727677 2.841065
tapply(tumor$thickness, list(tumor$status, tumor$ulcer), FUN = length)
##       absent present
## alive     92      42
## died      16      41
## other      7       7
#-------------------------------------------------------------------------------
# Form a simple contingency table.  Contingency tables provide a way to evaluate
# the relationship between two qualitative variables, in this case sex and status.

mytable <- (table(tumor$sex,tumor$status))
mytable
##         
##          alive died other
##   female    91   28     7
##   male      43   29     7
# Convert the cells counts in this table to proportions.
n <- sum(mytable)
n
## [1] 205
mytable/n
##         
##               alive       died      other
##   female 0.44390244 0.13658537 0.03414634
##   male   0.20975610 0.14146341 0.03414634
prop.table(mytable)  # This is a quicker way to obtain proportions.
##         
##               alive       died      other
##   female 0.44390244 0.13658537 0.03414634
##   male   0.20975610 0.14146341 0.03414634
addmargins(mytable)
##         
##          alive died other Sum
##   female    91   28     7 126
##   male      43   29     7  79
##   Sum      134   57    14 205
prop.table(mytable, 1) # row proportions
##         
##               alive       died      other
##   female 0.72222222 0.22222222 0.05555556
##   male   0.54430380 0.36708861 0.08860759
prop.table(mytable, 2) # column proportions
##         
##              alive      died     other
##   female 0.6791045 0.4912281 0.5000000
##   male   0.3208955 0.5087719 0.5000000
addmargins(table(tumor$sex,tumor$status,tumor$ulcer))
## , ,  = absent
## 
##         
##          alive died other Sum
##   female    68    8     3  79
##   male      24    8     4  36
##   Sum       92   16     7 115
## 
## , ,  = present
## 
##         
##          alive died other Sum
##   female    23   20     4  47
##   male      19   21     3  43
##   Sum       42   41     7  90
## 
## , ,  = Sum
## 
##         
##          alive died other Sum
##   female    91   28     7 126
##   male      43   29     7  79
##   Sum      134   57    14 205
#-------------------------------------------------------------------------------
# Scatterplots are useful for displaying the relationship between two numerical
# variables.  Form a scatterplot of tumor thickness versus age.

plot(tumor$age, tumor$thickness)

plot(tumor$age, tumor$thickness, main = "Tumor Thickness versus Age", col = "red",
     cex = 1.0, pch = 16,  xlab = "Age", ylab = "Thickness")

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# new plot graph for the Tumor Thickness vs Age.
# See the changes of code below:
plot(tumor$age, tumor$thickness, 
     main = "Tumor Thickness vs. Age", 
     col = "blue", 
     pch = 19, 
     cex = 1.2, 
     xlab = "Patient Age (Years)", 
     ylab = "Tumor Thickness (mm)",
     xlim = c(0, 100),
     ylim = c(0, max(tumor$thickness) + 1))

grid()  # Adds a background grid for better readability
abline(lm(tumor$thickness ~ tumor$age), col = "red", lwd = 2)  # Adds a regression line
legend("topright", legend = c("Data Points", "Trend Line"), 
       col = c("blue", "red"), pch = c(19, NA), lty = c(NA, 1), lwd = c(NA, 2))

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------