- The RStudio environment
- Basic operators
- Basic data types
- Define variables
variable_name <- some_value
target <- data
variable_name <- some_value
target <- data
A vector is a sequence of values, all of the same type
# Create a numeric vector; c() stands for "combine".
x <- c(1, 3, 7, 15)
x
## [1]  1  3  7 15
is.vector(x)
## [1] TRUE
length(x) # find the number of elements in a vector
## [1] 4
seq(from = 1, to = 10) # sequence
## [1]  1  2  3  4  5  6  7  8  9 10
1:10 # sequence shorthand
## [1]  1  2  3  4  5  6  7  8  9 10
seq(from = 1, to = 10, by = 2) # sequence with a step of 2
## [1] 1 3 5 7 9
rep(7, times = 3) # repeat
## [1] 7 7 7
vec <- c(10, 20, 7, 13) # assigning a vector to a variable
vec
## [1] 10 20  7 13
# Attach a name to each element of an existing vector.
names(vec) <- c("name1", "name2", "name3", "name4")
vec
## name1 name2 name3 name4
##    10    20     7    13
# Names can also be given while building the vector. Each element needs its
# own distinct name to reproduce the result above; quotes are optional.
vec <- c("name1" = 10, "name2" = 20, "name3" = 7, "name4" = 13) # same result
vec <- c(name1 = 10, name2 = 20, name3 = 7, name4 = 13) # same result
# c() also concatenates whole vectors.
vec1 <- c(1, 3, 5)
vec2 <- c(11, 13, 15)
c(vec1, vec2, c(21, 23, 25))
## [1]  1  3  5 11 13 15 21 23 25
Try to find the most efficient way (using the fewest characters) to create the following vectors:
## [1] 10 11 12 13 14 15 16 17 18 19 20 ## [1] -30 -25 -20 -15 -10 -5 0 5 10 15 20 25 30 ## apple banana cherry ## 4 5 3
Vector computations are performed element-wise
# Arithmetic on vectors is applied element by element.
earnings <- c(10, 20, 30, 40)
expenses <- c(5, 25, 25, 10)
5 * earnings # the single number is applied to every element
## [1]  50 100 150 200
earnings - expenses
## [1]  5 -5  5 30
earnings * c(1, 2, 3, 4)
## [1]  10  40  90 160
x <- c(10, 20, 30)
Is there a value of 20 in `x`?
20 %in% x
## [1] TRUE
Is there a value of 40 in `x`?
40 %in% x
## [1] FALSE
basket <- c("apple", "banana", "cherry")
Does `basket` have apple?
"apple" %in% basket
## [1] TRUE
Does `basket` have cheese?
"cheese" %in% basket
## [1] FALSE
In the real world, your data may contain missing values. In R, we use `NA` (upper case) to represent a missing value.
# A numeric vector whose third value is missing (NA).
vec <- c(1, 4, NA, 2)
vec
## [1]  1  4 NA  2
But `NA` creates problems for most numerical functions. For example, we cannot add `NA` to other numbers.
sum(vec)
## [1] NA
max(vec)
## [1] NA
To apply these numerical functions to data containing `NA`s, we simply remove the `NA`s from the calculation. That is,
# na.rm = TRUE drops missing values before the function is applied.
# Spell out TRUE: T is an ordinary variable that can be reassigned.
sum(vec, na.rm = TRUE) # remove NAs before calculating the sum
## [1] 7
max(vec, na.rm = TRUE) # remove NAs before getting the max value
## [1] 4
Recycling repeats elements in the shorter vector until its length matches the longer vector
u <- c(10, 20)
v <- c(1, 2, 3, 4, 5)
# The shorter vector is recycled to match the longer vector. Because 5 is not
# a multiple of 2, R also warns that the "longer object length is not a
# multiple of shorter object length"; the result is still computed.
u + v
## [1] 11 22 13 24 15
Under the hood:
u + v
= c(10, 20) + c(1, 2, 3, 4, 5)
= c(10, 20, 10, 20, 10) + c(1, 2, 3, 4, 5) # recycling
= c(10+1, 20+2, 10+3, 20+4, 10+5) # element-wise operation
= c(11, 22, 13, 24, 15)
Without typing the following into R, guess what the result of the last line would be:
# Exercise: predict the value printed by the last line.
vec1 <- 1:4
vec2 <- c(1, 0, 0, 0)
vec1 - vec2
Another more challenging one:
seq(from=0, to=10, by=2) + 3:1
You can retrieve elements from a vector by specifying the indexes of the elements. This operation is also known as subsetting.
# A named vector can be subset by position or by name.
vec <- c("value1" = 10, "value2" = 20, "value3" = 30, "value4" = 40)
vec[1] # get the element at index 1
## value1
##     10
vec["value3"] # get the element whose name matches the string
## value3
##     30
You can provide more than just one index.
vec[1:3] # specify a vector of indexes
## value1 value2 value3 ## 10 20 30
vec[c(3, 2, 1, 4)] # return with the specified order
## value3 value2 value1 value4 ## 30 20 10 40
vec[c("value4", "value4")]
## value4 value4 ## 40 40
# Negative indexes exclude the given positions instead of selecting them.
vec <- c("value1" = 10, "value2" = 20, "value3" = 30, "value4" = 40)
vec[-1] # all but the first element
## value2 value3 value4
##     20     30     40
vec[-c(1, 2)] # all but the first and the second elements
## value3 value4
##     30     40
A list is also a container for values, but it can accommodate items of different data types.
# A list holding a character string and a numeric vector.
x <- list("Bob", c(100, 80, 90))
x
## [[1]]
## [1] "Bob"
##
## [[2]]
## [1] 100  80  90
Just like vectors, you can give each element a name:
x <- list(name = "Bob", grades = c(100, 80, 90))
x
## $name
## [1] "Bob"
##
## $grades
## [1] 100  80  90
# Single brackets keep the list wrapper around the selected element.
x[2] # get the second element as a list
## $grades
## [1] 100  80  90
x["grades"] # get the element named "grades" as a list
## $grades
## [1] 100  80  90
y1 <- x[2]
class(y1)
## [1] "list"
# Double brackets (and $) return the element itself.
x[[2]] # get the second element as a vector
## [1] 100  80  90
y2 <- x[[2]]
class(y2)
## [1] "numeric"
x[["grades"]] # get the element named "grades" as a vector
## [1] 100  80  90
x$grades # most common/readable way to retrieve an element by name
## [1] 100  80  90
## $name ## [1] "Anna" ## ## $is_female ## [1] TRUE ## ## $age ## [1] 22 ## ## $enrollment ## [1] "CIS4710" "CIS4720" "CIS4730" ## ## $grade_point ## [1] 4 3 4
A matrix is a collection of data elements arranged in a two-dimensional rectangular layout.
# Build a 2 x 3 matrix from the values 1 to 6, filling one row at a time.
A <- matrix(
  1:6,          # the data elements
  nrow = 2,     # number of rows
  ncol = 3,     # number of columns
  byrow = TRUE  # fill matrix by rows
)
A
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
A data frame is a set of vectors of equal length. Think of a data frame as an Excel sheet or a database table.
Column names are preserved or guessed if not explicitly set
course <- c("CIS4730", "CIS4710", "CIS4950", "CIS1234")
num_of_students <- c(20, 10, 40, 30)
data_analytics_minor <- c(TRUE, TRUE, TRUE, FALSE)
# Unnamed arguments keep their variable name as the column name, while
# n_students = ... sets the column name explicitly.
df <- data.frame(
  course,
  n_students = num_of_students,
  data_analytics_minor,
  stringsAsFactors = FALSE # keep course as character strings
)
df # notice the column names and row names
##    course n_students data_analytics_minor
## 1 CIS4730         20                 TRUE
## 2 CIS4710         10                 TRUE
## 3 CIS4950         40                 TRUE
## 4 CIS1234         30                FALSE
ncol(df) # number of columns
## [1] 3
nrow(df) # number of rows
## [1] 4
colnames(df) # get column names
## [1] "course" "n_students" "data_analytics_minor"
rownames(df) # get row names
## [1] "1" "2" "3" "4"
You can change column and row names:
df2 <- df # create a copy of df, and name it as "df2"
colnames(df2) <- c("col1", "col2", "col3") # assign column names
colnames(df2) # they were "course", "n_students", "data_analytics_minor"
## [1] "col1" "col2" "col3"
rownames(df2) <- c("row1", "row2", "row3", "row4") # assign row names
rownames(df2) # they were "1", "2", "3", "4"
## [1] "row1" "row2" "row3" "row4"
There are many ways you can get values out of a column:
dataframe_name$column_name
df$course
## [1] "CIS4730" "CIS4710" "CIS4950" "CIS1234"
df$n_students
## [1] 20 10 40 30
df
## course n_students data_analytics_minor ## 1 CIS4730 20 TRUE ## 2 CIS4710 10 TRUE ## 3 CIS4950 40 TRUE ## 4 CIS1234 30 FALSE
All of a row
df[2,] # row 2
## course n_students data_analytics_minor ## 2 CIS4710 10 TRUE
df
## course n_students data_analytics_minor ## 1 CIS4730 20 TRUE ## 2 CIS4710 10 TRUE ## 3 CIS4950 40 TRUE ## 4 CIS1234 30 FALSE
Multiple rows
df[c(1,3),] # rows 1 & 3
## course n_students data_analytics_minor ## 1 CIS4730 20 TRUE ## 3 CIS4950 40 TRUE
df[2,1] # row 2, column 1
## [1] "CIS4710"
df[c(3,4),c(1,2)] # rows 3 & 4, columns 1 & 2
## course n_students ## 3 CIS4950 40 ## 4 CIS1234 30
df["2","n_students"] # "2": row name, "n_students": column name
## [1] 10
Rows matching a condition:
df
## course n_students data_analytics_minor ## 1 CIS4730 20 TRUE ## 2 CIS4710 10 TRUE ## 3 CIS4950 40 TRUE ## 4 CIS1234 30 FALSE
# Be careful that it's df$course in the brackets, not just course
df[df$course == 'CIS4730', ]
##    course n_students data_analytics_minor
## 1 CIS4730         20                 TRUE
df
## course n_students data_analytics_minor ## 1 CIS4730 20 TRUE ## 2 CIS4710 10 TRUE ## 3 CIS4950 40 TRUE ## 4 CIS1234 30 FALSE
df$course == 'CIS4730'
## [1] TRUE FALSE FALSE FALSE
df[df$course == 'CIS4730', ]
# is interpreted by R as the following
df[c(TRUE, FALSE, FALSE, FALSE), ]
The result is that you get the row(s) where the condition is `TRUE`.
# Packages used by the XML examples.
library(xml2)
library(XML)
# Installing is a one-time step; run this line manually if tidyverse is missing
# rather than reinstalling it every time the script runs.
# install.packages("tidyverse")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4 ## v tibble 3.1.6 v dplyr 1.0.7 ## v tidyr 1.1.4 v stringr 1.4.0 ## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() -- ## x dplyr::filter() masks stats::filter() ## x dplyr::lag() masks stats::lag()
# Read the xml file
menu_data <- read_xml('https://www.w3schools.com/xml/simple.xml')
# display the class of menu_data
class(menu_data)
## [1] "xml_document" "xml_node"
menu_data
## {xml_document} ## <breakfast_menu> ## [1] <food>\n <name>Belgian Waffles</name>\n <price>$5.95</price>\n <descri ... ## [2] <food>\n <name>Strawberry Belgian Waffles</name>\n <price>$7.95</price> ... ## [3] <food>\n <name>Berry-Berry Belgian Waffles</name>\n <price>$8.95</price ... ## [4] <food>\n <name>French Toast</name>\n <price>$4.50</price>\n <descripti ... ## [5] <food>\n <name>Homestyle Breakfast</name>\n <price>$6.95</price>\n <de ...
# Parse menu_data into an R structure representing the XML tree
menu_xml <- xmlParse(menu_data)
# Display the XML tree
menu_xml
## <?xml version="1.0" encoding="UTF-8"?> ## <breakfast_menu> ## <food> ## <name>Belgian Waffles</name> ## <price>$5.95</price> ## <description>Two of our famous Belgian Waffles with plenty of real maple syrup</description> ## <calories>650</calories> ## </food> ## <food> ## <name>Strawberry Belgian Waffles</name> ## <price>$7.95</price> ## <description>Light Belgian waffles covered with strawberries and whipped cream</description> ## <calories>900</calories> ## </food> ## <food> ## <name>Berry-Berry Belgian Waffles</name> ## <price>$8.95</price> ## <description>Light Belgian waffles covered with an assortment of fresh berries and whipped cream</description> ## <calories>900</calories> ## </food> ## <food> ## <name>French Toast</name> ## <price>$4.50</price> ## <description>Thick slices made from our homemade sourdough bread</description> ## <calories>600</calories> ## </food> ## <food> ## <name>Homestyle Breakfast</name> ## <price>$6.95</price> ## <description>Two eggs, bacon or sausage, toast, and our ever-popular hash browns</description> ## <calories>950</calories> ## </food> ## </breakfast_menu> ##
# Convert the parsed XML to a dataframe
df_menu <- xmlToDataFrame(nodes = getNodeSet(menu_xml, "//food"))
class(df_menu)
## [1] "data.frame"
#View(df_menu)
# Extract XML data using xpath
menu <- xml_find_all(menu_data, xpath = "/breakfast_menu/food")
print(xml_text(menu))
## [1] "Belgian Waffles$5.95Two of our famous Belgian Waffles with plenty of real maple syrup650" ## [2] "Strawberry Belgian Waffles$7.95Light Belgian waffles covered with strawberries and whipped cream900" ## [3] "Berry-Berry Belgian Waffles$8.95Light Belgian waffles covered with an assortment of fresh berries and whipped cream900" ## [4] "French Toast$4.50Thick slices made from our homemade sourdough bread600" ## [5] "Homestyle Breakfast$6.95Two eggs, bacon or sausage, toast, and our ever-popular hash browns950"
# Collect the text of every <name> node into a character vector.
breakfast_name <- xml_find_all(menu_data, xpath = "//name") %>% xml_text()
print(breakfast_name)
## [1] "Belgian Waffles"             "Strawberry Belgian Waffles"
## [3] "Berry-Berry Belgian Waffles" "French Toast"
## [5] "Homestyle Breakfast"
# Collect the text of every <price> node the same way.
breakfast_price <- xml_find_all(menu_data, xpath = "//price") %>% xml_text()
print(breakfast_price)
## [1] "$5.95" "$7.95" "$8.95" "$4.50" "$6.95"
# Label each price with its dish name.
# Recall names(vec) <- c("name1", "name2", "name3", "name4")
names(breakfast_price) <- breakfast_name
breakfast_price
## Belgian Waffles Strawberry Belgian Waffles ## "$5.95" "$7.95" ## Berry-Berry Belgian Waffles French Toast ## "$8.95" "$4.50" ## Homestyle Breakfast ## "$6.95"
# Installing is a one-time step; run these lines manually if a package is
# missing rather than reinstalling on every run.
# install.packages("tidyverse")
# install.packages("jsonlite")
library(tidyverse)
library(jsonlite)
# Download the JSON document and parse it into R objects.
url_json <- "https://mdn.github.io/learning-area/javascript/oojs/json/superheroes.json"
superheros <- jsonlite::fromJSON(url_json)
class(superheros)
## [1] "list"
#print(superheros)
#str(superheros)
#View(superheros)
# Convert the parsed JSON to a data frame.
# Note: this reuses the name df, replacing the course data frame defined earlier.
#df <- as.data.frame(superheros)
df <- jsonlite::fromJSON(url_json) %>% as.data.frame()
print(df)
## squadName homeTown formed secretBase active members.name ## 1 Super Hero Squad Metro City 2016 Super tower TRUE Molecule Man ## 2 Super Hero Squad Metro City 2016 Super tower TRUE Madame Uppercut ## 3 Super Hero Squad Metro City 2016 Super tower TRUE Eternal Flame ## members.age members.secretIdentity ## 1 29 Dan Jukes ## 2 39 Jane Wilson ## 3 1000000 Unknown ## members.powers ## 1 Radiation resistance, Turning tiny, Radiation blast ## 2 Million tonne punch, Damage resistance, Superhuman reflexes ## 3 Immortality, Heat Immunity, Inferno, Teleportation, Interdimensional travel
# install.packages("devtools") # devtools::install_github("blmoore/rjsonpath") # library(rjsonpath) # df2 <- read_json(url_json) # json_path(df2, "$.members[*].name") # json_path(df2, "$..name")
This lab assignment involves 2 tasks (see the next 2 slides, scroll to the bottom for instructions).
Once you finish the following tasks, please put everything in a single R file named `assignment1.R` (`.R` is the file extension) and upload it to iCollege (Lab Assignment 1).
Caution:
In addition, lab assignments will be graded based on:
## $name ## [1] "Alex" "Bob" "Claire" "Denise" ## ## $female ## [1] FALSE FALSE TRUE TRUE ## ## $age ## [1] 20 25 30 35
## [1] "Bob"
## name female age ## row_1 Alex FALSE 20 ## row_2 Bob FALSE 25 ## row_3 Claire TRUE 30 ## row_4 Denise TRUE 35
## [1] 27.5
## [1] 30