# Class: Foundations of Statistics Using R
# Title: in-class exercises
# Session: 3 (online meeting 2)
# Topic: R Packages and Scripts
# Last updated: 03/30/2017
# Point R at the folder that holds the practice files
setwd("~/Dropbox/R_Stat_Workshop/practice_exercises_in_progress")
# Load the Winter Olympics medal table (comma-separated, first row is a header)
data <- read.csv(file = "winter_olympic.csv", header = TRUE, sep = ",")
# To browse the table in the data viewer, uncomment the next line
# View(data)
# Number of variables: a data frame has one column per variable
print(ncol(data))
## [1] 7
# Names of the variables (columns)
colnames(data)
## [1] "Rank" "NOC" "Gold" "Silver" "Bronze" "Total" "Region"
# Rows x columns; each row is one country
dim(data)
## [1] 26 7
# Average number of medals per country, one medal type at a time
mean(data[["Gold"]])
## [1] 3.807692
mean(data[["Silver"]])
## [1] 3.730769
mean(data[["Bronze"]])
## [1] 3.807692
# Total number of medals awarded, one medal type at a time
sum(data[["Gold"]])
## [1] 99
sum(data[["Silver"]])
## [1] 97
sum(data[["Bronze"]])
## [1] 99
# Print the countries that missed at least one of the 3 medal types
# (no gold, or no silver, or no bronze), then print how many there are.
# Hint: use subset
medal_subset <- subset(x = data, subset = Gold == 0 | Silver == 0 | Bronze == 0)
medal_subset
## Rank NOC Gold Silver Bronze Total Region
## 8 8 Belarus (BLR) 5 0 1 6 EURASIA
## 20 20 Ukraine (UKR) 1 0 1 2 EURASIA
## 21 21 Slovakia (SVK) 1 0 0 1 EUROPE
## 22 22 Italy (ITA) 0 2 6 8 EUROPE
## 23 23 Latvia (LAT) 0 2 2 4 EURASIA
## 24 24 Australia (AUS) 0 2 1 3 AUSTRALIA
## 25 25 Croatia (CRO) 0 1 0 1 EUROPE
## 26 26 Kazakhstan (KAZ) 0 0 1 1 EURASIA
# Count countries with nrow(): each row is one country. length() on a data
# frame returns the number of columns (7 here), not the number of countries.
nrow(medal_subset)
## [1] 8
# The bare string below works as a multi-line note holding the exercise
# instructions. It is an ordinary expression, so R echoes it as a character
# value when the script is run with auto-printing.
"
Exercises
We will be using basketball data for this exercise.
Source: http://www.ncaa.com/rankings/basketball-men/d1/ncaa-mens-basketball-rpi
(Note: using 2014 data )
Use script language to do the following:
(Remember to add comments)
1) set working directory -- Hint: setwd()
2) import the csv file: march_madness.csv
3) view the file
4) print number of rows and columns -- Hint: dim()
5) print columns names
6) change column names to lower case so it is easier to use
Hint: names(df_name) <- tolower(names(df_name))
7) explore the variable types. -- Hint: str()
8) how many different conferences are there?
9) Let’s look at the difference in values of first two columns:
a) compute a new vector called diff
b) calculate the difference in rank and previous
c) print count and list of schools that changed 3 or more places
Hint: create subset that satisfies criteria
"
## [1] "\nExercises\nWe will be using basketball data for this exercise.\n\nSource: http://www.ncaa.com/rankings/basketball-men/d1/ncaa-mens-basketball-rpi\n(Note: using 2014 data )\nUse script language to do the following:\n (Remember to add comments)\n\n1) set working directory -- Hint: setwd()\n2) import the csv file: march_madness.csv\n3) view the file\n4) print number of rows and columns -- Hint: dim() \n5) print columns names\n6) change column names to lower case so it is easier to use\nHint: names(df_name) <- tolower(names(df_name))\n7) explore the variable types. -- Hint: str()\n8) how many different conferences are there? \n9) Let’s look at the difference in values of first two columns:\n a) compute a new vector called diff \n b) calculate the difference in rank and previous\n c) print count and list of schools that changed 3 or more places\n Hint: create subset that satisfies criteria\n"
# Exercise 2: import the basketball rankings (comma-separated, header row)
march_madness <- read.csv("march_madness.csv", sep = ",", header = TRUE)
# Exercise 4: check dimensions of the data frame
# length() gives the number of columns; dim() gives rows and columns
length(march_madness)
## [1] 9
dim(march_madness)
## [1] 349 9
# Exercise 3: uncomment to open the data viewer
#View(march_madness)
# Exercise 5: get column (variable) names
names(march_madness)
## [1] "RANK" "PREVIOUS" "SCHOOL" "CONFERENCE" "RECORD"
## [6] "ROAD" "NEUTRAL" "HOME" "NON.DI"
# Exercise 6: change upper case names to lower case
names(march_madness) <- tolower(names(march_madness))
# see if column names are now all lower case
names(march_madness)
## [1] "rank" "previous" "school" "conference" "record"
## [6] "road" "neutral" "home" "non.di"
# Exercise 7: check variable types
str(march_madness)
## 'data.frame': 349 obs. of 9 variables:
## $ rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ previous : int 2 1 3 4 5 6 8 11 7 10 ...
## $ school : Factor w/ 349 levels "A&M-Corpus Christi",..: 87 11 131 340 326 343 124 327 72 61 ...
## $ conference: Factor w/ 33 levels "AAC","America East",..: 26 24 6 20 7 10 6 4 4 7 ...
## $ record : Factor w/ 123 levels "10-17","10-18",..: 100 98 80 101 93 87 87 95 88 87 ...
## $ road : Factor w/ 102 levels "0-10","0-11",..: 17 89 66 27 17 89 56 81 55 81 ...
## $ neutral : Factor w/ 34 levels "0- 0","0- 1",..: 26 23 24 26 18 23 33 30 29 20 ...
## $ home : Factor w/ 111 levels "0-12","1-10",..: 49 52 42 45 42 38 42 43 49 45 ...
## $ non.di : Factor w/ 10 levels "0-0","0-1","1-0",..: 1 1 1 3 1 1 1 1 1 1 ...
# Exercise 8: how many different conferences are there?
# unique() works whether the column was read as a factor or as character.
# Expected result: 33, matching the 33 levels str() reports above.
length(unique(march_madness$conference))
# Rank change per school, stored as a new column: current rank minus
# previous rank (a negative value means the rank number went down)
march_madness[["diff"]] <- with(march_madness, rank - previous)
# Keep the schools whose rank moved by 3 or more places in either direction.
# which() skips NA comparisons, the same way subset() does.
mm_subset <- march_madness[which(abs(march_madness[["diff"]]) >= 3), , drop = FALSE]
# Rows = number of schools that qualified; columns include the new diff
dim(mm_subset)
## [1] 9 10
# List the qualifying schools
print(mm_subset)
## rank previous school conference record road neutral home
## 8 8 11 Virginia Atlantic Coast 28-6 7-4 6- 0 15-2
## 18 18 21 Michigan St. Big Ten 26-8 7-3 7- 1 12-4
## 32 32 37 Saint Joseph's Atlantic 10 24-9 8-4 5- 1 11-4
## 76 76 70 Georgia St. Sun Belt 25-8 10-6 2- 2 11-0
## 90 90 98 La.-Lafayette Sun Belt 23-11 6-9 4- 0 10-2
## 196 196 199 Coastal Caro. Big South 21-12 6-7 0- 0 12-5
## 198 198 201 Rider Metro Atlantic 14-17 6-8 2- 2 6-7
## 201 201 196 UNC Asheville Big South 17-15 5-10 1- 3 8-2
## 245 245 242 Mississippi St. Southeastern 14-19 0-10 3- 2 11-7
## non.di diff
## 8 0-0 -3
## 18 0-0 -3
## 32 0-0 -5
## 76 2-0 6
## 90 3-0 -8
## 196 3-0 -3
## 198 0-0 -3
## 201 3-0 5
## 245 0-0 3