# Week 1 Demonstration R Code for PREDICT 401
# This program will demonstrate various data displays using RStudio and base R.
#-------------------------------------------------------------------------------
tumor <- read.csv(file.path("~/Desktop/MSDS401/", "tumor.csv"), sep=",")
# The tumor data are from 205 patients in Denmark with malignant melanoma.
# The data frame has 205 rows and 5 variables. The variables are:
# status "died" died from melanoma, "alive" alive, "other" died other causes
# sex "male", "female"
# age age in years
# thickness tumour thickness in mm
# ulcer "present", "absent"
#-------------------------------------------------------------------------------
# Check the structure of the data set. Calculate some simple statistics.
str(tumor)
## 'data.frame': 205 obs. of 5 variables:
## $ status : chr "other" "other" "alive" "other" ...
## $ sex : chr "male" "male" "male" "female" ...
## $ age : int 76 56 41 71 52 28 77 60 49 68 ...
## $ thickness: num 6.76 0.65 1.34 2.9 12.08 ...
## $ ulcer : chr "present" "absent" "absent" "absent" ...
head(tumor, n = 5L)
## status sex age thickness ulcer
## 1 other male 76 6.76 present
## 2 other male 56 0.65 absent
## 3 alive male 41 1.34 absent
## 4 other female 71 2.90 absent
## 5 died male 52 12.08 present
summary(tumor)
## status sex age thickness
## Length:205 Length:205 Min. : 4.00 Min. : 0.10
## Class :character Class :character 1st Qu.:42.00 1st Qu.: 0.97
## Mode :character Mode :character Median :54.00 Median : 1.94
## Mean :52.46 Mean : 2.92
## 3rd Qu.:65.00 3rd Qu.: 3.56
## Max. :95.00 Max. :17.42
## ulcer
## Length:205
## Class :character
## Mode :character
##
##
##
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Qualitative Data Displays of Individual Variables
#-------------------------------------------------------------------------------
# A pie chart is a circular display of data where the area represents 100% of the
# data, and each slice the corresponding percent breakdown. Form a pie chart
# showing the breakdown of patient status.
patient.status <- table(tumor$status) # This is a simple frequency table.
patient.status
##
## alive died other
## 134 57 14
pie(patient.status)

# Plot a pie chart based on age classification. This demonstrates how a ratio
# variable can be converted into an ordinal variable using age classes.
classes <- c(0,20,40,60,80,100) # This defines the age class boundaries.
result <- cut(tumor$age, breaks = classes, right = FALSE)
result
## [1] [60,80) [40,60) [40,60) [60,80) [40,60) [20,40) [60,80) [60,80)
## [9] [40,60) [60,80) [40,60) [60,80) [60,80) [60,80) [0,20) [60,80)
## [17] [40,60) [60,80) [80,100) [40,60) [80,100) [20,40) [20,40) [40,60)
## [25] [60,80) [60,80) [80,100) [40,60) [0,20) [40,60) [60,80) [40,60)
## [33] [40,60) [60,80) [60,80) [60,80) [0,20) [60,80) [40,60) [40,60)
## [41] [40,60) [60,80) [60,80) [60,80) [20,40) [60,80) [40,60) [60,80)
## [49] [60,80) [60,80) [80,100) [60,80) [60,80) [60,80) [20,40) [40,60)
## [57] [40,60) [20,40) [20,40) [40,60) [40,60) [60,80) [40,60) [40,60)
## [65] [40,60) [40,60) [20,40) [60,80) [40,60) [40,60) [20,40) [80,100)
## [73] [40,60) [0,20) [40,60) [40,60) [40,60) [40,60) [20,40) [40,60)
## [81] [40,60) [40,60) [40,60) [60,80) [40,60) [40,60) [60,80) [40,60)
## [89] [60,80) [40,60) [40,60) [60,80) [80,100) [60,80) [40,60) [60,80)
## [97] [20,40) [40,60) [20,40) [60,80) [60,80) [80,100) [60,80) [40,60)
## [105] [60,80) [20,40) [40,60) [40,60) [20,40) [60,80) [60,80) [40,60)
## [113] [40,60) [60,80) [20,40) [40,60) [60,80) [40,60) [40,60) [20,40)
## [121] [40,60) [60,80) [60,80) [40,60) [60,80) [40,60) [60,80) [60,80)
## [129] [60,80) [40,60) [40,60) [60,80) [60,80) [60,80) [40,60) [40,60)
## [137] [40,60) [40,60) [60,80) [40,60) [60,80) [40,60) [20,40) [40,60)
## [145] [20,40) [40,60) [20,40) [40,60) [60,80) [60,80) [60,80) [20,40)
## [153] [60,80) [40,60) [20,40) [60,80) [40,60) [60,80) [20,40) [40,60)
## [161] [20,40) [40,60) [60,80) [20,40) [40,60) [20,40) [40,60) [60,80)
## [169] [20,40) [60,80) [60,80) [40,60) [60,80) [0,20) [60,80) [40,60)
## [177] [20,40) [40,60) [20,40) [40,60) [60,80) [40,60) [40,60) [20,40)
## [185] [40,60) [0,20) [20,40) [40,60) [40,60) [20,40) [40,60) [40,60)
## [193] [60,80) [40,60) [40,60) [20,40) [20,40) [40,60) [20,40) [0,20)
## [201] [20,40) [40,60) [40,60) [40,60) [40,60)
## Levels: [0,20) [20,40) [40,60) [60,80) [80,100)
# Note that age class is an ordinal variable. Each observation is identified by
# its respective age class. The age classes have an implicit order:
# [0, 20), [20, 40), [40, 60), and so forth.
count <- table(result)
pie(count)

# The labels and colors can be changed and a title added.
lbls <- c("Class 1","Class 2","Class 3","Class 4","Class 5")
colors = c("red", "yellow", "green", "violet", "orange")
pie(count, labels = lbls, col = colors, main = "Pie Chart of Age Distribution")

#-------------------------------------------------------------------------------
# Bar graphs are used to display qualitative data with the catgories on one axis,
# and the magnitude of some quantity, like frequency, forming the length of a bar.
# Form vertical and horizontal bar plots showing age distribution frequencies.
barplot(count)

# The generic display can be enhanced.
colors = c("red", "yellow", "green", "violet", "orange")
barplot(count, col=colors, main = "Bar Plot of Age Distribution",
names.arg = c("[0,20)","[20,40)","[40,60)","[60,80)","[80,100)"),
xlab = "Age Class", ylab = "Frequency")

barplot(count, col=colors, main = "Bar Plot of Age Distribution",
names.arg = c("Class 1","Class 2","Class 3","Class 4","Class 5"),
xlab = "Age Class", ylab = "Frequency", horiz = TRUE)

#-------------------------------------------------------------------------------
# Form stacked and adjacent bar plots with colors and legend.
# Bar plots are useful when dealing with nominal and ordinal variables.
counts <- table(tumor$sex, tumor$status)
barplot(counts, main="Patient Distribution by Status and Sex", ylab = "Frequency",
xlab="Status", col=c("darkblue","red"),legend = rownames(counts))

# result <- cut(tumor$age, breaks = classes, right = FALSE) was defined earlier.
counts <- table(result, tumor$sex)
counts <- t(counts) #This organizes the data for vertical stacking.
counts
## result
## [0,20) [20,40) [40,60) [60,80) [80,100)
## female 4 23 56 41 2
## male 3 12 32 27 5
barplot(counts, main="Patient Distribution by Age and Sex", ylab = "Frequency",
xlab="Age Class", col=c("darkblue","red"),legend = rownames(counts))

barplot(counts, main="Patient Distribution by Age and Sex", ylab = "Frequency",
xlab="Age Class", col=c("darkblue","red"),legend = rownames(counts),
names.arg = c("Class 1","Class 2","Class 3","Class 4","Class 5"), beside = TRUE)

#--------------------------------------------------------------------------------
# A pareto chart is a particular application of a bar graph in which the categories
# are ranked in order of occurrence. The previous bar plot turns out to be similar
# to a pareto chart. To generate a pareto chart, the package "qcc" needs to be
# installed and called so that the function "pareto.chart()" can be used.
library(qcc)
## Package 'qcc' version 2.7
## Type 'citation("qcc")' for citing this R package in publications.
patient.status <- table(tumor$status)
pareto.chart(patient.status)

##
## Pareto chart analysis for patient.status
## Frequency Cum.Freq. Percentage Cum.Percent.
## alive 134.000000 134.000000 65.365854 65.365854
## died 57.000000 191.000000 27.804878 93.170732
## other 14.000000 205.000000 6.829268 100.000000
age.distribution <- count
pareto.chart(age.distribution)

##
## Pareto chart analysis for age.distribution
## Frequency Cum.Freq. Percentage Cum.Percent.
## [40,60) 88.000000 88.000000 42.926829 42.926829
## [60,80) 68.000000 156.000000 33.170732 76.097561
## [20,40) 35.000000 191.000000 17.073171 93.170732
## [0,20) 7.000000 198.000000 3.414634 96.585366
## [80,100) 7.000000 205.000000 3.414634 100.000000
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Quantitative data displays of individual variables
#-------------------------------------------------------------------------------
# Histograms show the frequency of data in given class intervals. The shape of
# data distribution is revealed for an initial overview.
# Form histograms showing age distribution. Use the same class boundaries and
# colors. Here age is treated as a ratio variable without prior classification.
hist(tumor$age)

hist(tumor$age, breaks = classes, main = "Histogram Showing Age Distribution",
xlab = "Age Class", col = colors)

# A frequency polygon is similar to a histogram displaying class frequencies
# versus class midpoint. Form a display using the vector "count" defined earlier.
center <- c(10, 30, 50, 70, 90) # Class interval midpoints
plot(center, count)

plot(center, count, type = "b", col = "red", main = "Frequency Polygon",
xlab = "Age Class Midpoints", ylab = "Frequency", xlim = c(0, 100))

# An ogive is a cumulative frequency polygon. Steep slopes identify where sharp
# increases in frequencies occur. Construct an ogive of age distribution defined
# earlier. (The following calculation could be done using a "for" loop.)
cum.count <- numeric(0) # This defines cum.count as a numeric vector.
cum.count[1] <- count[1]
cum.count[2] <- cum.count[1]+ count[2]
cum.count[3] <- cum.count[2] + count[3]
cum.count[4] <- cum.count[3] + count[4]
cum.count[5] <- cum.count[4] + count[5]
cum.count
## [1] 7 42 130 198 205
n <- cum.count[5]
plot(center, cum.count, type = "b", col = "red", main = "Ogive of Age Distribution",
xlab = "Age Class Midpoints", ylab = "Cumulative Frequency", xlim = c(0, 100))

plot(center, cum.count/n, type = "b", col = "red", xlab = "Age Class Midpoints",
main = "Ogive of Cumulative Relative Frequencies", xlim = c(0, 100),
ylab = "Cumulative Relative Frequency")

#-------------------------------------------------------------------------------
# A stem-and-leaf plot is sometimes used to give a unique view of the data. The
# left most digits of a variable form the stem, and the right most form the leaf.
# R will round off the leaf entries to single digits.
sort(tumor$thickness, decreasing = FALSE)
## [1] 0.10 0.16 0.16 0.16 0.16 0.16 0.16 0.16 0.24 0.32 0.32 0.32
## [13] 0.32 0.32 0.32 0.48 0.48 0.48 0.48 0.58 0.64 0.64 0.64 0.64
## [25] 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.65 0.81 0.81
## [37] 0.81 0.81 0.81 0.81 0.81 0.81 0.81 0.81 0.81 0.97 0.97 0.97
## [49] 0.97 0.97 0.97 0.97 0.97 0.97 0.97 0.97 1.03 1.13 1.13 1.13
## [61] 1.13 1.29 1.29 1.29 1.29 1.29 1.29 1.29 1.29 1.29 1.29 1.29
## [73] 1.29 1.29 1.29 1.29 1.29 1.34 1.34 1.37 1.45 1.45 1.45 1.53
## [85] 1.62 1.62 1.62 1.62 1.62 1.62 1.62 1.62 1.62 1.62 1.62 1.62
## [97] 1.76 1.78 1.78 1.94 1.94 1.94 1.94 1.94 1.94 1.94 1.94 1.94
## [109] 1.94 2.10 2.10 2.10 2.24 2.26 2.26 2.26 2.26 2.26 2.34 2.42
## [121] 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.74 2.90 2.90
## [133] 2.90 3.06 3.06 3.22 3.22 3.22 3.22 3.22 3.22 3.22 3.22 3.22
## [145] 3.22 3.54 3.54 3.54 3.54 3.54 3.54 3.54 3.54 3.56 3.87 3.87
## [157] 3.87 3.87 3.87 3.87 4.04 4.09 4.19 4.19 4.51 4.82 4.83 4.83
## [169] 4.84 4.84 4.84 4.84 4.84 5.16 5.16 5.16 5.48 5.48 5.64 5.80
## [181] 5.80 6.12 6.12 6.44 6.76 7.06 7.06 7.09 7.09 7.41 7.73 7.73
## [193] 7.89 8.06 8.38 8.54 9.66 12.08 12.24 12.56 12.88 12.88 13.85 14.66
## [205] 17.42
# Note how the stem-and-leaf plot rounds the original data prior to plotting.
stem(tumor$thickness)
##
## The decimal point is at the |
##
## 0 | 122222222333333555566666777777777788888888888
## 1 | 0000000000001111333333333333333333455556666666666668889999999999
## 2 | 111233333346666666667999
## 3 | 112222222222555555556999999
## 4 | 0122588888888
## 5 | 22255688
## 6 | 1148
## 7 | 11114779
## 8 | 145
## 9 | 7
## 10 |
## 11 |
## 12 | 12699
## 13 | 9
## 14 | 7
## 15 |
## 16 |
## 17 | 4
# A scale parameter can be used with this display.
stem(tumor$thickness, scale = 1)
##
## The decimal point is at the |
##
## 0 | 122222222333333555566666777777777788888888888
## 1 | 0000000000001111333333333333333333455556666666666668889999999999
## 2 | 111233333346666666667999
## 3 | 112222222222555555556999999
## 4 | 0122588888888
## 5 | 22255688
## 6 | 1148
## 7 | 11114779
## 8 | 145
## 9 | 7
## 10 |
## 11 |
## 12 | 12699
## 13 | 9
## 14 | 7
## 15 |
## 16 |
## 17 | 4
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Look at two variables simultaneously.
#-------------------------------------------------------------------------------
# Calculate various summary statistics such as the mean, standard deviation, etc.
# R provides a variety of functions which can be used with tables and lists.
tapply(tumor$thickness, list(tumor$sex, tumor$ulcer), FUN = mean)
## absent present
## female 1.737342 3.745532
## male 1.973611 4.982093
tapply(tumor$thickness, list(tumor$sex, tumor$ulcer), FUN = sd)
## absent present
## female 1.904453 3.447580
## male 2.727677 2.841065
tapply(tumor$thickness, list(tumor$status, tumor$ulcer), FUN = length)
## absent present
## alive 92 42
## died 16 41
## other 7 7
#-------------------------------------------------------------------------------
# Form a simple contingency table. Contingency tables provide a way to evaluate
# the relationship between two qualitative variables, in this case sex and status.
mytable <- (table(tumor$sex,tumor$status))
mytable
##
## alive died other
## female 91 28 7
## male 43 29 7
# Convert the cells counts in this table to proportions.
n <- sum(mytable)
n
## [1] 205
mytable/n
##
## alive died other
## female 0.44390244 0.13658537 0.03414634
## male 0.20975610 0.14146341 0.03414634
prop.table(mytable) # This is a quicker way to obtain proportions.
##
## alive died other
## female 0.44390244 0.13658537 0.03414634
## male 0.20975610 0.14146341 0.03414634
addmargins(mytable)
##
## alive died other Sum
## female 91 28 7 126
## male 43 29 7 79
## Sum 134 57 14 205
prop.table(mytable, 1) # row proportions
##
## alive died other
## female 0.72222222 0.22222222 0.05555556
## male 0.54430380 0.36708861 0.08860759
prop.table(mytable, 2) # column proportions
##
## alive died other
## female 0.6791045 0.4912281 0.5000000
## male 0.3208955 0.5087719 0.5000000
addmargins(table(tumor$sex,tumor$status,tumor$ulcer))
## , , = absent
##
##
## alive died other Sum
## female 68 8 3 79
## male 24 8 4 36
## Sum 92 16 7 115
##
## , , = present
##
##
## alive died other Sum
## female 23 20 4 47
## male 19 21 3 43
## Sum 42 41 7 90
##
## , , = Sum
##
##
## alive died other Sum
## female 91 28 7 126
## male 43 29 7 79
## Sum 134 57 14 205
#-------------------------------------------------------------------------------
# Scatterplots are useful for displaying the relationship between two numerical
# variables. Form a scatterplot of tumor thickness versus age.
plot(tumor$age, tumor$thickness)

plot(tumor$age, tumor$thickness, main = "Tumor Thickness versus Age", col = "red",
cex = 1.0, pch = 16, xlab = "Age", ylab = "Thickness")

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# new plot graph for the Tumor Thickness vs Age.
# See the changes of code below:
plot(tumor$age, tumor$thickness,
main = "Tumor Thickness vs. Age",
col = "blue",
pch = 19,
cex = 1.2,
xlab = "Patient Age (Years)",
ylab = "Tumor Thickness (mm)",
xlim = c(0, 100),
ylim = c(0, max(tumor$thickness) + 1))
grid() # Adds a background grid for better readability
abline(lm(tumor$thickness ~ tumor$age), col = "red", lwd = 2) # Adds a regression line
legend("topright", legend = c("Data Points", "Trend Line"),
col = c("blue", "red"), pch = c(19, NA), lty = c(NA, 1), lwd = c(NA, 2))

#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------