# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241| Module 2 Day 3 ~
# ~    Class Exercise       ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~

# We want to examine the characteristics of a small dataset (dset1)
# containing 8 variables. The code below will download and load 
# the dataset into your current RStudio session.

# Download the .RData file into the current working directory (binary
# mode, no progress messages, no cached copy), then load its contents
# into the session:
download.file(
  url      = "http://www.duke.edu/~sgrambow/crp241data/dset1.RData",
  destfile = "dset1.RData",
  mode     = "wb",
  quiet    = TRUE,
  cacheOK  = FALSE
)
load(file = "dset1.RData")

# QUESTION 1:
# Check: Was the data read in correctly?
# - How many observations are in the dataset? How many variables?

# SOLUTION
# Use the str function to check the data frame's structure: it reports
# the number of observations (rows), the number of variables (columns),
# and the type and first few values of each variable.
# Result: 11 observations of 8 variables, all of which are numeric
str(dset1)
## 'data.frame':    11 obs. of  8 variables:
##  $ x1: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x2: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x3: num  10 8 13 9 11 14 6 4 12 7 ...
##  $ x4: num  8 8 8 8 8 8 8 19 8 8 ...
##  $ y1: num  8.04 6.95 7.58 8.81 8.33 ...
##  $ y2: num  9.14 8.14 8.74 8.77 9.26 8.1 6.13 3.1 9.13 7.26 ...
##  $ y3: num  7.46 6.77 12.74 7.11 7.81 ...
##  $ y4: num  6.58 5.76 7.71 8.84 8.47 7.04 5.25 12.5 5.56 7.91 ...
# INTERPRETATION
# based on this, we suspect that x1,x2,x3 are all very similar and
# potentially identical

# QUESTION 2:
# Examine the raw data values -- anything unusual?

# SOLUTION
# Use the View function to visually inspect the raw values.
# (Note the capital V -- R function names are case-sensitive.)
View(dset1)

# INTERPRETATION
# visual inspection confirms that x1,x2,x3 are identical
# also reveals unusual structure of x4 which is comprised of all 8's 
# except for a single value of 19.

# QUESTION 3:
# Use the numerical and graphical summaries we have discussed 
# to examine the dataset and list at least five interesting
# facts about the data.

# SOLUTION
# try boxplots for all variables
# Passing the columns as a data frame (rather than as eight separate
# vectors) labels each box with its variable name instead of 1 through 8,
# which makes the plot much easier to read.
boxplot(dset1[c("x1", "x2", "x3", "x4", "y1", "y2", "y3", "y4")])

# INTERPRETATION
# Like the View output above, the boxplots show that x1, x2, x3 have
# identical distributions.
# They also illustrate the strange structure of x4.
# We can also see that y3 and y4 have very similar features and distributions

# Numerical summaries for every variable in the data frame:
# minimum, 1st quartile, median, mean, 3rd quartile, and maximum
summary(dset1)
##        x1             x2             x3             x4    
##  Min.   : 4.0   Min.   : 4.0   Min.   : 4.0   Min.   : 8  
##  1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 6.5   1st Qu.: 8  
##  Median : 9.0   Median : 9.0   Median : 9.0   Median : 8  
##  Mean   : 9.0   Mean   : 9.0   Mean   : 9.0   Mean   : 9  
##  3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.:11.5   3rd Qu.: 8  
##  Max.   :14.0   Max.   :14.0   Max.   :14.0   Max.   :19  
##        y1               y2              y3              y4        
##  Min.   : 4.260   Min.   :3.100   Min.   : 5.39   Min.   : 5.250  
##  1st Qu.: 6.315   1st Qu.:6.695   1st Qu.: 6.25   1st Qu.: 6.170  
##  Median : 7.580   Median :8.140   Median : 7.11   Median : 7.040  
##  Mean   : 7.501   Mean   :7.501   Mean   : 7.50   Mean   : 7.501  
##  3rd Qu.: 8.570   3rd Qu.:8.950   3rd Qu.: 7.98   3rd Qu.: 8.190  
##  Max.   :10.840   Max.   :9.260   Max.   :12.74   Max.   :12.500
# INTERPRETATION
# This shows us that x1-x4 all have the same mean of 9
# Can also see that y1-y4 all have the same mean of 7.5 but 
# otherwise appear to have different summary stats

# look at relationship/association between each pair of 
# x and y variables: (x1,y1), (x2,y2), (x3,y3), (x4,y4)
# let's put all plots on the same figure using the 
# par function
# see here on quick-r page: https://www.statmethods.net/advgraphs/layout.html
# the commands below will create
# 4 figures arranged in 2 rows and 2 columns
# 1st par command sets up the 2 rows and 2 columns format
# 2nd par command resets it back to 1 row and 1 column

par(mfrow = c(2, 2))
plot(dset1$x1, dset1$y1)
plot(dset1$x2, dset1$y2)
# x3 is paired with y3 (not y4), so that every (x, y) pair listed above
# is plotted exactly once
plot(dset1$x3, dset1$y3)
plot(dset1$x4, dset1$y4)

par(mfrow = c(1, 1))


# HERE ARE SOME INTERESTING FACTS YOU MAY HAVE COME UP WITH
# (1) x1, x2, x3 are all identical
# (2) x4 has 10 identical values of 8 and the 11th value is 19
# (3) x1, x2, x3, x4 all have the same mean of 9
# (4) y1, y2, y3, y4 all have the same mean of 7.5
# (5) Relationship between each x,y pair is different
# (6) No missing data
# (7) 

## END PROGRAM ##