# Class:        Foundations of Statistics Using R
# Title:        in-class exercises
# Session:      3 (online meeting 2)
# Topic:        R Packages and Scripts
# Last updated: 4/3/2016
# Show the directory R is currently working in
getwd()
## [1] "/Users/ksosulsk/Dropbox/R_Stat_Workshop/practice_exercises_in_progress"
# Point R at the folder that holds the exercise files
setwd(dir = "~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")

# Load the medal table: comma-separated, first row holds the column names
data <- read.csv(file = "winter_olympic.csv", header = TRUE, sep = ",")

# Optional: open the data frame in the spreadsheet-style viewer
#View(data)

# Number of variables, i.e. columns, in the data frame
print(ncol(data))
## [1] 7
# Column (variable) names
colnames(data)
## [1] "Rank"   "NOC"    "Gold"   "Silver" "Bronze" "Total"  "Region"
# Dimensions of the data frame: rows (countries) first, then columns
dim(data)
## [1] 26  7
#print(dim(data))

# Average number of medals per country, one medal type at a time

mean(data[["Gold"]])
## [1] 3.807692
mean(data[["Silver"]])
## [1] 3.730769
mean(data[["Bronze"]])
## [1] 3.807692
# Total number of medals awarded, one medal type at a time
sum(data[["Gold"]])
## [1] 99
sum(data[["Silver"]])
## [1] 97
sum(data[["Bronze"]])
## [1] 99
# Print list of countries that did not win each of the 3 medals
# Print number of countries that did not win each of the 3 medals
# Hint:  use subset

# Keep every row that has a zero in at least one of the three medal columns
medal_subset <- subset(x = data, subset = Gold == 0 | Silver == 0 | Bronze == 0)
medal_subset
##    Rank               NOC Gold Silver Bronze Total    Region
## 8     8     Belarus (BLR)    5      0      1     6   EURASIA
## 20   20     Ukraine (UKR)    1      0      1     2   EURASIA
## 21   21    Slovakia (SVK)    1      0      0     1    EUROPE
## 22   22       Italy (ITA)    0      2      6     8    EUROPE
## 23   23      Latvia (LAT)    0      2      2     4   EURASIA
## 24   24   Australia (AUS)    0      2      1     3 AUSTRALIA
## 25   25     Croatia (CRO)    0      1      0     1    EUROPE
## 26   26  Kazakhstan (KAZ)    0      0      1     1   EURASIA
# Count the countries with nrow(): each row is one country.
# length() on a data frame would return the number of columns (7) instead.
nrow(medal_subset)
## [1] 8
"
Exercises
We will be using basketball data for this exercise.

Source:  http://www.ncaa.com/rankings/basketball-men/d1/ncaa-mens-basketball-rpi
(Note: using 2014 data )
Use script language to do the following:
  (Remember to add comments)

1)  set working directory -- Hint:  setwd()
2)  import the csv file:  march_madness.csv
3)  view the file
4)  print number of rows and columns --  Hint:  dim() 
5)  print columns names
6)  change column names to lower case so it is easier to use
Hint:  names(df_name) <- tolower(names(df_name))
7)  explore the variable types.  -- Hint:  str()
8)  how many different conferences are there?  
9)  Let’s look at the difference in values of first two columns:
  a)  compute a new vector called diff  
  b)  calculate the difference in rank and previous
  c)  print count and list of schools that changed 3 or more places
    Hint:  create subset that satisfies criteria
"
## [1] "\nExercises\nWe will be using basketball data for this exercise.\n\nSource:  http://www.ncaa.com/rankings/basketball-men/d1/ncaa-mens-basketball-rpi\n(Note: using 2014 data )\nUse script language to do the following:\n  (Remember to add comments)\n\n1)  set working directory -- Hint:  setwd()\n2)  import the csv file:  march_madness.csv\n3)  view the file\n4)  print number of rows and columns --  Hint:  dim() \n5)  print columns names\n6)  change column names to lower case so it is easier to use\nHint:  names(df_name) <- tolower(names(df_name))\n7)  explore the variable types.  -- Hint:  str()\n8)  how many different conferences are there?  \n9)  Let’s look at the difference in values of first two columns:\n  a)  compute a new vector called diff  \n  b)  calculate the difference in rank and previous\n  c)  print count and list of schools that changed 3 or more places\n    Hint:  create subset that satisfies criteria\n"
# set working directory
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")

# import data: comma-separated file whose first row holds the column names

march <- read.csv("march_madness.csv", sep = ",", header = TRUE)

# check dimensions of df: length() gives the column count, dim() gives rows x columns
length(march)
## [1] 9
dim(march)
## [1] 349   9
# view the file (interactive viewer, so left commented out in the script)
#View(march)

# get column or variable names

names(march)
## [1] "RANK"       "PREVIOUS"   "SCHOOL"     "CONFERENCE" "RECORD"    
## [6] "ROAD"       "NEUTRAL"    "HOME"       "NON.DI"
# change upper case names to lower case
names(march) <- tolower(names(march))


# see if column names are now all lower case
names(march)
## [1] "rank"       "previous"   "school"     "conference" "record"    
## [6] "road"       "neutral"    "home"       "non.di"
# check variable types
str(march)
## 'data.frame':    349 obs. of  9 variables:
##  $ rank      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ previous  : int  2 1 3 4 5 6 8 11 7 10 ...
##  $ school    : Factor w/ 349 levels "A&M-Corpus Christi",..: 87 11 131 340 326 343 124 327 72 61 ...
##  $ conference: Factor w/ 33 levels "AAC","America East",..: 26 24 6 20 7 10 6 4 4 7 ...
##  $ record    : Factor w/ 123 levels "10-17","10-18",..: 100 98 80 101 93 87 87 95 88 87 ...
##  $ road      : Factor w/ 102 levels "0-10","0-11",..: 17 89 66 27 17 89 56 81 55 81 ...
##  $ neutral   : Factor w/ 34 levels "0- 0","0- 1",..: 26 23 24 26 18 23 33 30 29 20 ...
##  $ home      : Factor w/ 111 levels "0-12","1-10",..: 49 52 42 45 42 38 42 43 49 45 ...
##  $ non.di    : Factor w/ 10 levels "0-0","0-1","1-0",..: 1 1 1 3 1 1 1 1 1 1 ...
# how many different conferences are there?
# unique() works whether the column was read in as a factor or as character
length(unique(march$conference))
## [1] 33
# Add a column holding the change in ranking: current rank minus previous rank
march[["diff"]] <- with(march, rank - previous)

# Keep only the schools whose ranking moved by three or more places,
# in either direction; which() skips any rows where the comparison is NA
mm_subset <- march[which(abs(march[["diff"]]) >= 3), , drop = FALSE]
dim(mm_subset)
## [1]  9 10
mm_subset
##     rank previous          school     conference record road neutral home
## 8      8       11        Virginia Atlantic Coast   28-6  7-4    6- 0 15-2
## 18    18       21    Michigan St.        Big Ten   26-8  7-3    7- 1 12-4
## 32    32       37  Saint Joseph's    Atlantic 10   24-9  8-4    5- 1 11-4
## 76    76       70     Georgia St.       Sun Belt   25-8 10-6    2- 2 11-0
## 90    90       98   La.-Lafayette       Sun Belt  23-11  6-9    4- 0 10-2
## 196  196      199   Coastal Caro.      Big South  21-12  6-7    0- 0 12-5
## 198  198      201           Rider Metro Atlantic  14-17  6-8    2- 2  6-7
## 201  201      196   UNC Asheville      Big South  17-15 5-10    1- 3  8-2
## 245  245      242 Mississippi St.   Southeastern  14-19 0-10    3- 2 11-7
##     non.di diff
## 8      0-0   -3
## 18     0-0   -3
## 32     0-0   -5
## 76     2-0    6
## 90     3-0   -8
## 196    3-0   -3
## 198    0-0   -3
## 201    3-0    5
## 245    0-0    3