Untidy_data

#For the purposes of this assignment I uploaded the dataset from MySQL. 
rm(list=ls())
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.1     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

myurl <- "https://raw.githubusercontent.com/VioletaStoyanova/Untidy_data/master/flights.csv"
data<- read.csv(myurl, header = TRUE, stringsAsFactors = FALSE)
data

##   idflights_table Airline Schedule Los.Angeles Phoenix San.Diego
## 1               1  ALASKA  on_time         497     221       212
## 2               2    NULL delayed           62      12        20
## 3               3 AM WEST  on_time         694    4840       383
## 4               4    NULL delayed          117     415        65
##   San.Francisco Seattle
## 1           503    1841
## 2           102     305
## 3           320     201
## 4           129      61

names(data) <- c("ID","Airline","Schedule", "Los Angeles", "Phoenix", "San Diego",
                       "San Francisco", "Seattle")

colnames(data)

## [1] "ID"            "Airline"       "Schedule"      "Los Angeles"  
## [5] "Phoenix"       "San Diego"     "San Francisco" "Seattle"

#Then I created a data frame where I added Airline names in column 2 row 2 and column 2 row 4. 

data[2,2]<-c("ALASKA")
data[4,2]<-c("AM WEST")
data_df<-data.frame(data,stringsAsFactors=FALSE)
data_df[]<-lapply(data_df,as.array)
head(data_df)

##   ID Airline Schedule Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  1  ALASKA  on_time         497     221       212           503    1841
## 2  2  ALASKA delayed           62      12        20           102     305
## 3  3 AM WEST  on_time         694    4840       383           320     201
## 4  4 AM WEST delayed          117     415        65           129      61

#I attempted to group all the cities together and separated them by frequency.
flights<-data%>%
  gather(City,Freq,4:8) %>% select (-ID)

head(flights)

##   Airline Schedule        City Freq
## 1  ALASKA  on_time Los Angeles  497
## 2  ALASKA delayed  Los Angeles   62
## 3 AM WEST  on_time Los Angeles  694
## 4 AM WEST delayed  Los Angeles  117
## 5  ALASKA  on_time     Phoenix  221
## 6  ALASKA delayed      Phoenix   12

#At this step the two variables delayed and on time were separated into two columns.
temp <- flights%>%
  spread(Schedule,Freq)

str(temp$`delayed `) #this is the point where I realized that my delayed variable had a space and it was giving me trouble when I was trying to use the function mutate.

##  int [1:10] 62 12 20 102 305 117 415 65 129 61

mutate(temp, total_flight = `delayed `+on_time)

##    Airline          City delayed  on_time total_flight
## 1   ALASKA   Los Angeles       62     497          559
## 2   ALASKA       Phoenix       12     221          233
## 3   ALASKA     San Diego       20     212          232
## 4   ALASKA San Francisco      102     503          605
## 5   ALASKA       Seattle      305    1841         2146
## 6  AM WEST   Los Angeles      117     694          811
## 7  AM WEST       Phoenix      415    4840         5255
## 8  AM WEST     San Diego       65     383          448
## 9  AM WEST San Francisco      129     320          449
## 10 AM WEST       Seattle       61     201          262

colnames(temp)

## [1] "Airline"  "City"     "delayed " "on_time"

#I was suspiciuos and wanted to see the total number of flights comapring to the total percentage of delay between the two airlines. 
total_flights<- group_by(temp, Airline) %>% 
    summarise(total_delayed=sum(`delayed `), total_ontime=sum(on_time), total=sum(`delayed `+on_time))

head(total_flights)

## # A tibble: 2 x 4
##   Airline total_delayed total_ontime total
##   <chr>           <int>        <int> <int>
## 1 ALASKA            501         3274  3775
## 2 AM WEST           787         6438  7225

# And my suspicions were right as one can see that the total number of flights of AM WEST is 7225 and the total number of flights of ALASKA is 3775 it would be a mistake to conclude that AM WEST had more delays because they had more flights comapring to ALASKA.
sum_airline<- group_by(temp, Airline) %>% 
    summarise(avg_delay = mean(`delayed `), avg_on_time = mean(on_time)) %>% 
    mutate(percent_delay=100*(avg_delay/(avg_delay+avg_on_time))) 
sum_airline

## # A tibble: 2 x 4
##   Airline avg_delay avg_on_time percent_delay
##   <chr>       <dbl>       <dbl>         <dbl>
## 1 ALASKA        100         655          13.3
## 2 AM WEST       157        1288          10.9

Untidy_data

Violeta Stoyanova

3/3/2018