Topics for Today!

  1. Topic: Loading in data
  2. Topic: Managing data in dataframes
  3. Topic: Basic plotting in R - hist() and boxplot()
  4. Topic: Basic plotting in R - A first look at plot()

1. Topic: Loading Data

# getwd() reports the current working directory. Relative file names, such as
# "RegionEx_Data.txt" below, are looked up in this directory.
getwd()
## [1] "/Users/mike/Desktop/R Materials/Github Stuff"

# Point the working directory at the folder that holds RegionEx_Data.txt.
# The path below is specific to the author's machine: change it to wherever
# you downloaded the file.
setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments/Data")

# Load the data with read.table() (see ?read.table).
#   header = TRUE : the first line of the file holds the column names.
#                   NOTE(review): without it, read.table() would presumably
#                   treat that line as a data row and name the columns
#                   V1, V2, ... -- confirm against the file.
#   sep = "\t"    : the file is tab separated. For a .csv file use sep = ","
#                   or read.csv(filename).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
flight_data <- read.table("RegionEx_Data.txt", header = TRUE, sep = "\t")

# View(df) opens the table in a spreadsheet-style viewer. It is only useful in
# an interactive session, so it is skipped when the script runs in batch mode.
if (interactive()) {
  View(flight_data)
}

# unique() returns the distinct elements of a vector. Here it shows that the
# dataset contains two airlines.
unique(flight_data$Airline)
## [1] "RegionEx" "MDA"

2. Topic: Managing data in dataframes

# names() lists every column name of the flight_data data frame.
names(flight_data)
##  [1] "Airline"                  "Origin.airport"          
##  [3] "Destination.airport"      "Departure.date"          
##  [5] "Scheduled.departure.time" "Scheduled.arrival.time"  
##  [7] "Actual.arrival.time"      "Arrival.delay.in.minutes"
##  [9] "Delay.indicator"          "Day.of.Week"             
## [11] "Route.Code"               "Number.of.passengers"
# class() reports the type of the data stored in a column.
class(flight_data$Arrival.delay.in.minutes) # integer
## [1] "integer"
class(flight_data$Airline)                  # character
## [1] "character"
# Note that Airline is stored as plain "character" data, not as a "factor"
# (since R 4.0, read.table() no longer converts strings to factors by default).
# Factors are a special data type for categorical variables. We can convert
# other vectors to factors using the as.factor() function.

flight_data$Delay.indicator <- as.factor(flight_data$Delay.indicator)
flight_data$Day.of.Week <- as.factor(flight_data$Day.of.Week)

# Let's isolate the flight data for each of the two airlines. The logical
# test picks the rows; the empty slot after the comma keeps every column.

flight_data_MDA <- flight_data[flight_data$Airline == "MDA", ]
flight_data_RegionEx <- flight_data[flight_data$Airline == "RegionEx", ]


# table() counts how often each value occurs in one-dimensional data.
table(flight_data_RegionEx$Delay.indicator)
## 
##   0   1 
## 177  63
# Dividing the counts by the number of rows turns them into proportions.
table(flight_data_RegionEx$Delay.indicator) / nrow(flight_data_RegionEx)
## 
##      0      1 
## 0.7375 0.2625

3. Topic: Basic plotting in R - hist() and boxplot()

# Let's reload our data file from before and isolate the RegionEx data.
# Reloading overwrites flight_data, so the as.factor() conversions made earlier
# are not carried over into this fresh copy.
# The path is specific to the author's machine: change it to your data folder.
setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments/Data")
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
flight_data <- read.table("RegionEx_Data.txt", header = TRUE, sep = "\t")
flight_data_RegionEx <- flight_data[flight_data$Airline == "RegionEx", ]

# To explore the data, let's look at the arrival delays of the RegionEx
# flights. This vector feeds the histogram and box plot examples below.
delays <- flight_data_RegionEx$Arrival.delay.in.minutes

# hist() draws a histogram of a numeric vector.
# Example: every setting left at its default.
hist(delays)

# That is a very basic histogram; hist() takes many parameters to customize it.
# main = "" sets the title.
# Example:
hist(
  delays,
  main = "Delays in Minutes"
)

# xlab = "" and ylab = "" set the x-axis and y-axis labels in the same way.
# Example:
hist(
  delays,
  main = "Delays in Minutes",
  xlab = "Arrival Delay (min)",
  ylab = "# of delays"
)

# breaks = c(b1, b2, .., bk) sets the bin edges by hand.
# Example: bins that are 10 minutes wide.
hist(
  delays,
  main = "Delays in Minutes",
  xlab = "Arrival Delay (min)",
  ylab = "# of delays",
  breaks = seq(from = min(delays) - 10, to = max(delays) + 10, by = 10)
)

# The bin edges come from
#   seq(from = min(delays) - 10, to = max(delays) + 10, by = 10)
# Run that expression on its own to see what it returns.
# Question: why are the -10 and +10 in there?

# Similar to a histogram, we can make a box plot using the command boxplot().

# Example: every setting left at its default.
boxplot(delays)

# boxplot() can be customized just like hist(), using main, xlab and ylab.
# A box plot is drawn vertically by default, so the delay values run along the
# y axis. The y label therefore names the measured quantity (minutes of delay)
# and the x label names the group of flights shown in the box.

# Example:
boxplot(delays, main = "Delays in Minutes",
        xlab = "RegionEx flights", ylab = "Arrival Delay (min)")

4. Topic: Basic plotting in R - plot()

# Plotting two continuous variables against each other with plot().
# Pull the two columns of interest out of the RegionEx data first.
delays <- flight_data_RegionEx[["Arrival.delay.in.minutes"]]
num_of_passengers <- flight_data_RegionEx[["Number.of.passengers"]]

# Formula form "y ~ x": number of passengers on the x axis, delays on the y axis.
plot(delays ~ num_of_passengers)

# The title and axis labels are set with main, xlab and ylab, as before.
plot(
  delays ~ num_of_passengers,
  main = "Delay vs. Passenger Load",
  xlab = "Number of Passengers",
  ylab = "Delay (min)"
)

# More on this next time!