# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Module 2 Day 3 ~
# ~ Class Exercise ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# We want to examine the characteristics of a small dataset (dset1)
# containing 8 variables. The code below will download and load
# the dataset into your current RStudio session.
# Fetch the course data file and load it into the session:
# - mode = "wb" writes the downloaded file in binary mode
# - cacheOK = FALSE asks for a fresh copy rather than a cached one
# - load() then restores the objects saved in the .RData file
download.file(
  url = "http://www.duke.edu/~sgrambow/crp241data/dset1.RData",
  destfile = "dset1.RData",
  quiet = TRUE,
  mode = "wb",
  cacheOK = FALSE
)
load(file = "dset1.RData")
# QUESTION 1:
# Check: Was the data read in correctly?
# - How many observations are in the dataset? How many variables?
# SOLUTION
# str() prints a compact description of the data frame's structure;
# the output below shows 11 observations of 8 variables, all numeric
str(object = dset1)
## 'data.frame': 11 obs. of 8 variables:
## $ x1: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x2: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x3: num 10 8 13 9 11 14 6 4 12 7 ...
## $ x4: num 8 8 8 8 8 8 8 19 8 8 ...
## $ y1: num 8.04 6.95 7.58 8.81 8.33 ...
## $ y2: num 9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
## $ y3: num 7.46 6.77 12.74 7.11 7.81 ...
## $ y4: num 6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
# INTERPRETATION
# based on this, we suspect that x1,x2,x3 are all very similar and
# potentially identical
# QUESTION 2:
# Examine the raw data values -- anything unusual?
# SOLUTION
# Use the View function to open the data frame in a spreadsheet-style
# viewer and visually inspect the raw values.
# NOTE(review): View() needs an interactive session (this exercise assumes
# RStudio) -- confirm before running the script non-interactively.
View(dset1)
# INTERPRETATION
# visual inspection confirms that x1,x2,x3 are identical
# also reveals unusual structure of x4 which is comprised of all 8's
# except for a single value of 19.
# QUESTION 3:
# Use the numerical and graphical summaries we have discussed
# to examine the dataset and list at least five interesting
# facts about the data.
# SOLUTION
# Try side-by-side boxplots for all variables.
# Passing the data frame itself draws one box per column (x1-x4 then y1-y4,
# the column order reported by str above) and labels each box with its
# variable name, so the plot can be read without counting positions.
boxplot(dset1)

# INTERPRETATION
# Boxplots, like View above, show that x1,x2,x3 have identical distributions.
# They also illustrate the strange structure of x4
# Can also see that y3 and y4 have very similar features and distributions
# Numerical summaries (min, quartiles, median, mean, max) for every variable
summary(object = dset1)
## x1 x2 x3 x4
## Min. : 4.0 Min. : 4.0 Min. : 4.0 Min. : 8
## 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 6.5 1st Qu.: 8
## Median : 9.0 Median : 9.0 Median : 9.0 Median : 8
## Mean : 9.0 Mean : 9.0 Mean : 9.0 Mean : 9
## 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.:11.5 3rd Qu.: 8
## Max. :14.0 Max. :14.0 Max. :14.0 Max. :19
## y1 y2 y3 y4
## Min. : 4.260 Min. :3.100 Min. : 5.39 Min. : 5.250
## 1st Qu.: 6.315 1st Qu.:6.695 1st Qu.: 6.25 1st Qu.: 6.170
## Median : 7.580 Median :8.140 Median : 7.11 Median : 7.040
## Mean : 7.501 Mean :7.501 Mean : 7.50 Mean : 7.501
## 3rd Qu.: 8.570 3rd Qu.:8.950 3rd Qu.: 7.98 3rd Qu.: 8.190
## Max. :10.840 Max. :9.260 Max. :12.74 Max. :12.500
# INTERPRETATION
# This shows us that x1-x4 all have the same mean of 9
# Can also see that y1-y4 all have the same mean of 7.5 but
# otherwise appear to have different summary stats
# Look at the relationship/association between each pair of
# x and y variables: (x1,y1), (x2,y2), (x3,y3), (x4,y4).
# Put all four scatterplots on the same figure using the par function
# (see the Quick-R page: https://www.statmethods.net/advgraphs/layout.html).
# The 1st par command sets up a layout of 2 rows and 2 columns;
# the 2nd par command resets it back to 1 row and 1 column.
par(mfrow = c(2, 2))
plot(dset1$x1, dset1$y1)
plot(dset1$x2, dset1$y2)
# x3 is paired with y3, matching the list of pairs above
plot(dset1$x3, dset1$y3)
plot(dset1$x4, dset1$y4)

par(mfrow = c(1, 1))
# HERE ARE SOME INTERESTING FACTS YOU MAY HAVE COME UP WITH
# (1) x1, x2, x3 are all identical
# (2) x4 has 10 identical values of 8 and the 11th value is 19
# (3) x1, x2, x3, x4 all have the same mean of 9
# (4) y1, y2, y3, y4 all have the same mean of 7.5
# (5) Relationship between each x,y pair is different
# (6) No missing data
# (7) y1, y2, y3, y4 differ in their other summary stats (median, quartiles, min/max)
## END PROGRAM ##