#Exercise-1 #level-1 1. Create a vector of integers 1 to 10. Print the data from position 3 to 7.
x=x1.1=1:10
x1=x1.1[3:7]
x1
## [1] 3 4 5 6 7
for(i in 1:(length(x)/2)){
print(x[i*2])
}
## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10
x2=subset(x1,x1%%2==0)
x2
## [1] 4 6
x3.1=1:20
x3=subset(x3.1,x3.1%%2==1)
x3
## [1] 1 3 5 7 9 11 13 15 17 19
x4=seq(1, 50, 1.5)
x4
## [1] 1.0 2.5 4.0 5.5 7.0 8.5 10.0 11.5 13.0 14.5 16.0 17.5 19.0 20.5 22.0
## [16] 23.5 25.0 26.5 28.0 29.5 31.0 32.5 34.0 35.5 37.0 38.5 40.0 41.5 43.0 44.5
## [31] 46.0 47.5 49.0
x5.1=1:5
x5.2=1:5
x5=c(x5.1,x5.2)#Duplicates are not removed
x5
## [1] 1 2 3 4 5 1 2 3 4 5
6.How to get the unique elements from above merged vector?
unique(x5)
## [1] 1 2 3 4 5
typeof(NA)
## [1] "logical"
seq(5)
## [1] 1 2 3 4 5
8.2. Repeat command: rep(c(1,4), 4) and rep(c(1,4), each=4)
rep(c(1,4), 4)
## [1] 1 4 1 4 1 4 1 4
rep(c(1,4), each=4)
## [1] 1 1 1 1 4 4 4 4
#level-2 #1.x = c(1:5) y = x x = c(3:7) 1. What is the output for all(x==y)?
x = c(1:5)
y = x
x = c(3:7)
all(x ==y)
## [1] FALSE
x = c(1:5)
y = x
x = c(3:7)
c(setdiff(x, y),setdiff(y, x))
## [1] 6 7 1 2
p = vector(length=6)
p[1] = 4
p
## [1] 4 0 0 0 0 0
typeof(c(1,2,3, NA))
## [1] "double"
typeof(c('1', '2', NA))
## [1] "character"
5.2. Explain the coercion: log_vec = c(TRUE, TRUE, F) sum(log_vec) Description of sum:sum returns the sum of all the values present in its arguments. Since R treats true and FALSE as 1 and 0 by default, the final sum is 0+1+1=2
log_vec = c(TRUE, TRUE, F)
sum(log_vec)
## [1] 2
#Exercise-2 #level-1 1.How is a list different from an atomic vector? Atomic vectors contain homogeneous elements. Lists may contain heterogeneous elements. 2. Guess the output: x = list(1:4, “a”, TRUE, list(c(‘hello’, ‘world!’))) y = c(1:4, “a”, TRUE, list(c(‘hello’, ‘world!’))) x == y
x = list(1:4, "a", TRUE, list(c('hello', 'world!')))
y = c(1:4, "a", TRUE, list(c('hello', 'world!')))
#x == y
3.Guess the output: typeof(a_list[2]) Is ‘list’, If we want to see the original type for an element in the list, write it as typeof(a_list[[2]]).
#level-2 1.Guess Output-1
something=list(name="PS",age=34)
something$na
## [1] "PS"
2.Guess Output-2 length() means the length of vectors (including lists),so we can find that there are only l1 and l2 two characters.
something=vector("list")
something$l1=c(1,2)
something$l2=1:4
typeof(something)
## [1] "list"
length(something)
## [1] 2
something
## $l1
## [1] 1 2
##
## $l2
## [1] 1 2 3 4
#Exercise-3 #level-1 1. How to convert a row vector of 1:10 to matrix mat shown in the figure ?
x = c(1:10)
m = matrix(x,nrow=2,ncol=5,byrow=TRUE)
m
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 4 5
## [2,] 6 7 8 9 10
x = list(1:10)
e = unlist(x)
names(e)<-NULL
m=matrix(e,nrow=2,byrow=TRUE)
m
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 4 5
## [2,] 6 7 8 9 10
#level-2 1. In matrix mat from question 2; change all elements greater than 3 to 0.
m1=m
m1[m1>3]<-0
m1
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 0 0
## [2,] 0 0 0 0 0
m2=m*2
m2
## [,1] [,2] [,3] [,4] [,5]
## [1,] 2 4 6 8 10
## [2,] 12 14 16 18 20
m3=m
dim(m3) = c(1*10,1)
m3
## [,1]
## [1,] 1
## [2,] 6
## [3,] 2
## [4,] 7
## [5,] 3
## [6,] 8
## [7,] 4
## [8,] 9
## [9,] 5
## [10,] 10
#Exercise-4 #level-1 1. Learn the utility of the following functions by executingthem on dataframe: class_modified summary(); head(); tail(); as.matrix()
student_id=101:105
age= sample(15:20, replace=TRUE, size=5)
gender=sample(c("M","F"),5,replace = TRUE,prob = c(0.6,0.4))
class_data=data.frame(student_id,gender,age,stringsAsFactors = FALSE)
str(class_data)
## 'data.frame': 5 obs. of 3 variables:
## $ student_id: int 101 102 103 104 105
## $ gender : chr "M" "F" "F" "M" ...
## $ age : int 19 20 20 16 17
new_student=list(106,'F',18)
class_data=rbind(class_data,new_student)
class_data$discount=ifelse(class_data$age<16,0.10,0.05)
height=sample(150:180,replace = TRUE,size=6)
class_additional_info=data.frame(student_id=101:106,nationality=sample(c("Asian","European","Japanese"),size = 6,replace = TRUE),stringsAsFactors = F)
class_modified=merge(class_data,class_additional_info,by="student_id")
class_modified
## student_id gender age discount nationality
## 1 101 M 19 0.05 European
## 2 102 F 20 0.05 Asian
## 3 103 F 20 0.05 European
## 4 104 M 16 0.05 European
## 5 105 F 17 0.05 Japanese
## 6 106 F 18 0.05 Japanese
summary(class_modified)
## student_id gender age discount
## Min. :101.0 Length:6 Min. :16.00 Min. :0.05
## 1st Qu.:102.2 Class :character 1st Qu.:17.25 1st Qu.:0.05
## Median :103.5 Mode :character Median :18.50 Median :0.05
## Mean :103.5 Mean :18.33 Mean :0.05
## 3rd Qu.:104.8 3rd Qu.:19.75 3rd Qu.:0.05
## Max. :106.0 Max. :20.00 Max. :0.05
## nationality
## Length:6
## Class :character
## Mode :character
##
##
##
head(class_modified)
## student_id gender age discount nationality
## 1 101 M 19 0.05 European
## 2 102 F 20 0.05 Asian
## 3 103 F 20 0.05 European
## 4 104 M 16 0.05 European
## 5 105 F 17 0.05 Japanese
## 6 106 F 18 0.05 Japanese
tail(class_modified)
## student_id gender age discount nationality
## 1 101 M 19 0.05 European
## 2 102 F 20 0.05 Asian
## 3 103 F 20 0.05 European
## 4 104 M 16 0.05 European
## 5 105 F 17 0.05 Japanese
## 6 106 F 18 0.05 Japanese
as.matrix(class_modified)
## student_id gender age discount nationality
## [1,] "101" "M" "19" "0.05" "European"
## [2,] "102" "F" "20" "0.05" "Asian"
## [3,] "103" "F" "20" "0.05" "European"
## [4,] "104" "M" "16" "0.05" "European"
## [5,] "105" "F" "17" "0.05" "Japanese"
## [6,] "106" "F" "18" "0.05" "Japanese"
# student_id as row.names
row.names(class_modified) = class_modified$student_id
class_modified$student_id = NULL
attributes(class_modified)
## $names
## [1] "gender" "age" "discount" "nationality"
##
## $row.names
## [1] "101" "102" "103" "104" "105" "106"
##
## $class
## [1] "data.frame"
as.matrix(class_modified)
## gender age discount nationality
## 101 "M" "19" "0.05" "European"
## 102 "F" "20" "0.05" "Asian"
## 103 "F" "20" "0.05" "European"
## 104 "M" "16" "0.05" "European"
## 105 "F" "17" "0.05" "Japanese"
## 106 "F" "18" "0.05" "Japanese"
#Remove student_id from the columns later
class_modified[-1,]
## gender age discount nationality
## 102 F 20 0.05 Asian
## 103 F 20 0.05 European
## 104 M 16 0.05 European
## 105 F 17 0.05 Japanese
## 106 F 18 0.05 Japanese
class_modified[c(1,2,4), c(1,3)]
## gender discount
## 101 M 0.05
## 102 F 0.05
## 104 M 0.05
table(class_modified$gender)
##
## F M
## 4 2
weight = sample(60:80, replace = TRUE, size = 6)
class_modified$weight = weight
class_modified
## gender age discount nationality weight
## 101 M 19 0.05 European 76
## 102 F 20 0.05 Asian 75
## 103 F 20 0.05 European 67
## 104 M 16 0.05 European 79
## 105 F 17 0.05 Japanese 77
## 106 F 18 0.05 Japanese 73
question_6=data.frame(name=c("Tom","Bob","Merv"),'2010'=c("married","single","single"),'2012'=c("father","married","single"))
question_6
## name X2010 X2012
## 1 Tom married father
## 2 Bob single married
## 3 Merv single single
#level-2 1. Create a BMI column say ‘bmi’ in class_modified. Use relevant columns from class_modified.
# Add a column; call it 'height'
height = sample(150:180, replace = TRUE, size = 6)
class_modified$height = height # can also use cbind()
class_modified
## gender age discount nationality weight height
## 101 M 19 0.05 European 76 166
## 102 F 20 0.05 Asian 75 154
## 103 F 20 0.05 European 67 170
## 104 M 16 0.05 European 79 169
## 105 F 17 0.05 Japanese 77 151
## 106 F 18 0.05 Japanese 73 180
class_modified$bmi = (class_modified$weight/ (class_modified$height)^2)*10000
# Conversion between centimeter and meter units
class_modified
## gender age discount nationality weight height bmi
## 101 M 19 0.05 European 76 166 27.58020
## 102 F 20 0.05 Asian 75 154 31.62422
## 103 F 20 0.05 European 67 170 23.18339
## 104 M 16 0.05 European 79 169 27.66010
## 105 F 17 0.05 Japanese 77 151 33.77045
## 106 F 18 0.05 Japanese 73 180 22.53086
names(class_modified)
## [1] "gender" "age" "discount" "nationality" "weight"
## [6] "height" "bmi"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
class_modified %>%
filter(nationality == 'European')
## gender age discount nationality weight height bmi
## 101 M 19 0.05 European 76 166 27.58020
## 103 F 20 0.05 European 67 170 23.18339
## 104 M 16 0.05 European 79 169 27.66010
2.2. all students with age more than 16 yrs. but less than 20 yrs.
class_modified %>%
filter(age > 16 & age < 20)
## gender age discount nationality weight height bmi
## 101 M 19 0.05 European 76 166 27.58020
## 105 F 17 0.05 Japanese 77 151 33.77045
## 106 F 18 0.05 Japanese 73 180 22.53086
Compute mean age of males and females separately from class_modified. [Hint: any grouping function?]
Create a data in a data frame with the following column names and requirements as given on next slide. Requirements: emp.id : Employee id; values in range 1 to 9. dep: 3 Employee departments- ‘front-desk’, ‘back-office’, ‘Reception’. Each repeated 3 times in data. bonus.prev: last year’s bonus; value range 2000 to 5000. More than one employee may receive same bonus. bonus.cur: Current year bonus with similar restriction as point 3. incr.prev: Percentage increment received last year; value range 0.00 to 0.05 from random uniform distribution. More than one employee may receive same percentage increment. incr.cur: Current year increment with similar restrictions as point 5. avg.bonus: 2 year average bonus for each employee. avg.incr: 2 year average increment for each employee.
#Exercise-5 #level-1 1. Write code to read ‘airline_delay_causes_data.csv’. Retail data limiting to columns: 1) that contain counts [Hint: columns that end with ‘*_ct’] , and 2) year, month, carrier, and airport. 1.1. How many columns are there in final data frame?
library(dplyr)
library(readxl)
library(Rcpp)
airline_xl = read_excel("/Users/ruiqianli/Desktop/EBAC_SB/Day_1/data/airline_delay_causes_data.xlsx")
Sorry, I have been trying for a long time to import this problem into Excel successfully…
1.2. Compared to reading full data is it better in terms of time taken or memory occupied or both?
#level-2 1. Read the following 2 files and join them to create a single dataframe. Retain all the records of the first sheet. • Workbook: injuries_2017.xlsx • Sheet-1: injuries_2017_P1 • Sheet-2: injuries_2017_descriptions Which common key can be used to join these 2 files? What kind of join did you use? What is the dimension of the final data frame?
#Exercise-6 #level-1 1. Convert the given vector of sizes to an ordered factor, say size_ordered, with levels: S < L < XL . Where X stands for extra. size_vec = c(“small”, “small”, “large”, “small”, “large”)
size_vec = c("small","small","large","small","large")
size_ordered = factor(size_vec,levels = c("small","large","extra large"),labels = c("S","L","XL"),ordered = T)
size_ordered
## [1] S S L S L
## Levels: S < L < XL
size_ordered=factor(size_ordered)
size_ordered
## [1] S S L S L
## Levels: S < L
ordered(size_ordered,levels=c("XL","L","S"))
## [1] S S L S L
## Levels: XL < L < S
#size_ordered[3] = "extra small“
#print(size_ordered)
#Exercise-7 #level-1 1. What is the output? dat = data.frame(x=c(1,2), y=c(3,4), z=c(5,6)) apply(dat[,c(‘x’,‘z’)], 1, function(x) sum(x) )
dat = data.frame(x=c(1,2), y=c(3,4), z=c(5,6))
apply(dat[,c('x','z')], 1, function(x) sum(x) )
## [1] 6 8
#Exercise-8 #level-1 1.Assign ‘flights’ data from package ‘nycflights13’ to a data frame and call it flights_data
library(nycflights13)
flights_data=data.frame(flights)
2.Explore the dataset for its structure and summary stats. Answer the following: 1)What is the dimension of this dataset?
dim(flights_data)
## [1] 336776 19
head(flights_data)
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## time_hour
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
2)Which colums would you rather assign as factors? “carrier origin” and “dest”, they are more convenient for classification. 3)What is the percentage of values missing per column?
colMeans(is.na(flights_data))
## year month day dep_time sched_dep_time
## 0.000000000 0.000000000 0.000000000 0.024511842 0.000000000
## dep_delay arr_time sched_arr_time arr_delay carrier
## 0.024511842 0.025871796 0.000000000 0.028000808 0.000000000
## flight tailnum origin dest air_time
## 0.000000000 0.007458964 0.000000000 0.000000000 0.028000808
## distance hour minute time_hour
## 0.000000000 0.000000000 0.000000000 0.000000000
4)How can you create a proper data column in dataframe flights_data?
flights_data$delay_or_not = ifelse(flights_data$dep_delay > 0, 'delay', 'not_delay')
head(flights_data)
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## time_hour delay_or_not
## 1 2013-01-01 05:00:00 delay
## 2 2013-01-01 05:00:00 delay
## 3 2013-01-01 05:00:00 delay
## 4 2013-01-01 05:00:00 not_delay
## 5 2013-01-01 06:00:00 not_delay
## 6 2013-01-01 05:00:00 not_delay
3.More Questions 5)How many unique origins and unique destinations do we have?
length(unique(flights_data$origin))
## [1] 3
length(unique(flights_data$dest))
## [1] 105
6)Are all origins connected to other destinations?
sum(ifelse(flights_data$origin==flights_data$dest,1,0))
## [1] 0
Yes, all the starting points are connected to other destinations. 7)Delay>15 mins is an important KPI in aviation. What proportion of flights are delayed more than 15 mins?
flights_data$delay15mins_or_not = ifelse(flights_data$dep_delay>15, 1, 0)
summary(flights_data$delay15mins_or_not)#mean=0.215=proportion of flights are delayed more than 15 mins
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 0.215 0.000 1.000 8255
mean = 0.215 = proportion of flights are delayed more than 15 mins