order

sort A-Z

sort(c (3, 1, 2, 3, 5, 4, 2), decreasing = FALSE)
## [1] 1 2 2 3 3 4 5

sort Z-A

sort(c (3, 1, 2, 3, 5, 4, 2), decreasing = TRUE)
## [1] 5 4 3 3 2 2 1

rank

rank(c (3, 1, 2, 3, 5, 4, 2))
## [1] 4.5 1.0 2.5 4.5 7.0 6.0 2.5

reorder

c(3, 1, 2, 3, 5, 4, 2)[order(c (3, 1, 2, 3, 5, 4, 2))]
## [1] 1 2 2 3 3 4 5

index

order(c (3, 1, 2, 3, 5, 4, 2))
## [1] 2 3 7 1 4 6 5

operation

addition

c(0, 2, 2.4) + c(1, -2, 0.6)
## [1] 1 0 3

subtraction

c(0, 2, 2.4) - c(1, -2, 0.6)
## [1] -1.0  4.0  1.8

multiplication

c(0, 2, 2.4) * c(1, -2, 0.6)
## [1]  0.00 -4.00  1.44

division

c(0, 2, 2.4) / c(1, -2, 0.6)
## [1]  0 -1  4

vectorized operations

# Vectorized AND
c(TRUE, TRUE, FALSE, FALSE) & c(T, F, T, F)
## [1]  TRUE FALSE FALSE FALSE
# Vectorized OR d
c(TRUE, TRUE, FALSE, FALSE) | c(T, F, T, F)
## [1]  TRUE  TRUE  TRUE FALSE
# exclusive logical OR
xor(c(TRUE, TRUE, FALSE, FALSE), c(T, F, T, F))
## [1] FALSE  TRUE  TRUE FALSE
# Using %in% operator
x <- c(1, 2, 3, 4, 5)
y <- c(2, 4, 6)
x %in% y
## [1] FALSE  TRUE FALSE  TRUE FALSE

range

# range in R (mathematical)
x=c(1,2,3,4,5)
range(x)
## [1] 1 5
#[1] 1 5
# range in R (NA)
x=c(1,2,NA,4,5)
range(x,na.rm=FALSE)
## [1] NA NA
#[1] NA NA
# range in R (na.rm)
range(x,na.rm=TRUE)
## [1] 1 5
#[1] 1 5
# range in R (alphabetical)
x=c("a","b","c","d","e","f","g")
range(x)
## [1] "a" "g"
#[1] "a" "g"

sample

set.seed(seed = 10)

#random sampling without replacement
sample(x = 1:10, size = 10, replace = FALSE)
##  [1]  9  7  8  6  3  2 10  5  4  1
set.seed(seed = 10)

#random sampling with replacement
sample(x = 1:10, size = 10, replace = TRUE)
##  [1]  9 10  7  8  6  7  3  8 10  7

sequence

seq(from=1,to=10)
##  [1]  1  2  3  4  5  6  7  8  9 10
seq(1.0,10.0)
##  [1]  1  2  3  4  5  6  7  8  9 10
seq(-1,-10)
##  [1]  -1  -2  -3  -4  -5  -6  -7  -8  -9 -10
seq(from=1,to=10,by=2)
## [1] 1 3 5 7 9
seq(from=5,to=50,by=5)
##  [1]  5 10 15 20 25 30 35 40 45 50
seq(5,50,5)
##  [1]  5 10 15 20 25 30 35 40 45 50
seq.int(from=5,to=50,length.out=10)
##  [1]  5 10 15 20 25 30 35 40 45 50
seq(from=-5,to=-50,length.out=10)
##  [1]  -5 -10 -15 -20 -25 -30 -35 -40 -45 -50
y<-c(5,10,15,20)
seq(1,25,along.with = y)
## [1]  1  9 17 25
df<-c(-1,-5,-5,-1)
seq(-5,5,along.with = df)
## [1] -5.000000 -1.666667  1.666667  5.000000
seq_len(10)
##  [1]  1  2  3  4  5  6  7  8  9 10
seq.int(-5,5)
##  [1] -5 -4 -3 -2 -1  0  1  2  3  4  5

string and substring

collapse

library(stringr)
str_flatten(string = c("pear", "orange", "berry", "nut"))
## [1] "pearorangeberrynut"

combination

library(stringr)
str_pad(string = c("pear", "orange", "berry", "nut"), width = 6, side = "both", pad = "Z")
## [1] "ZpearZ" "orange" "berryZ" "ZnutZZ"

concatenate

library(stringr)
cat("This line has a \t tab in it. \n Now, we have skipped a line and included ê.")
## This line has a   tab in it. 
##  Now, we have skipped a line and included ê.

duplicate

library(stringr)
str_dup(string = c("pear", "orange", "berry", "nut"), times = c(1, 2, 3, 4))
## [1] "pear"            "orangeorange"    "berryberryberry" "nutnutnutnut"

extract

x <- c("abcdefg", "ABCDEFGH", "123456789")
str_sub(string = x, start = 2, end = 5)
## [1] "bcde" "BCDE" "2345"
str_sub(string = x, end = -4)
## [1] "abcd"   "ABCDE"  "123456"

print

library(stringr)
print("This is a string.")
## [1] "This is a string."
noquote("This is a string without printed quotes.")
## [1] This is a string without printed quotes.

join

library(stringr)
stringr::str_c("Number: ", c(1, 2, 3, 4, 5, 6))
## [1] "Number: 1" "Number: 2" "Number: 3" "Number: 4" "Number: 5" "Number: 6"

length

library(stringr)
str_length(string = c("pear", "orange", "berry", "nut"))
## [1] 4 6 5 3

lower and upper

library(stringr)
tolower("ChanGE cAsE 1234")
## [1] "change case 1234"
toupper("ChanGE cAsE 1234")
## [1] "CHANGE CASE 1234"
chartr(old = "EhC", new = "eHX", "ChanGE cAsE 1234")
## [1] "XHanGe cAse 1234"
chartr(old = "abcX", new = "DEFx", "abcdefghijklmnopqrstuvwXyz")
## [1] "DEFdefghijklmnopqrstuvwxyz"

split

library(stringr)
strsplit("Age , Height , Weight , Other", split = ",")
## [[1]]
## [1] "Age "     " Height " " Weight " " Other"

substring

library(stringr)
substring("abcdefghijklmnopqrstuvwxyz", first = 2, last = 5)
## [1] "bcde"

trim

library(stringr)
x <- "   <--Extra Spaces Here-->   <--Here-->   "
str_trim(string = x, side = "both")
## [1] "<--Extra Spaces Here-->   <--Here-->"
str_squish(string = x)
## [1] "<--Extra Spaces Here--> <--Here-->"

truncation

library(stringr)
x <- "This string has a lot of characters in it."
str_trunc(string = x, width = 11, side = "right", ellipsis = ".....")
## [1] "This s....."

for loop & while loop

#certain
for (i in 1:10) {
  print(i)
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
#uncertain
i <- 1
while(i<=10) {
  print(i)
  i <- i + 1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
x <- 0
i <- 0
while (x < 2) {
  x <- rnorm(1,0,1)
  i <- i + 1
}
print(x)
## [1] 2.137767
print(i)
## [1] 46

if-else

x <- 0
if (x < 0) {
print("Negative number")
} else if (x > 0) {
print("Positive number")
} else
print("Zero")
## [1] "Zero"
f_to_c_message <- function(F) {
C <- (F - 32) * 5 / 9
if (C < -20) {
  print("so cold")
}
else if (C > 30) {
  print("so hot")
} 
else {
  print(C)
}
}
f_to_c_message(-10)
## [1] "so cold"
f_to_c_message(50)
## [1] 10
f_to_c_message(110)
## [1] "so hot"

system time

x <- 1:10
y <- 11:20
z <- NULL
system.time(for (i in seq_along(x)) {
  z <- c(z, x[i] + y[i])
})
##    user  system elapsed 
##   0.005   0.000   0.006
z 
##  [1] 12 14 16 18 20 22 24 26 28 30
x <- 1:10
y <- 11:20
z <- NULL
system.time(for (i in range(x)) {
  z <- c(z, x[i] + y[i])
})
##    user  system elapsed 
##   0.004   0.000   0.005
z 
## [1] 12 30
x <- 1:10
y <- 11:20
z <- vector(length=10)
system.time(for (i in 1:length(x)){
  z[i] <- x[i] + y[i]
})
##    user  system elapsed 
##   0.008   0.000   0.009
z
##  [1] 12 14 16 18 20 22 24 26 28 30
x <- 1:10
y <- 11:20
z <- vector(mode = "double", length=10)
system.time(for (i in 1:length(x)){
  z[i] <- x[i] + y[i]
})
##    user  system elapsed 
##   0.005   0.000   0.005
z
##  [1] 12 14 16 18 20 22 24 26 28 30

distribution

x=rnorm(50)
y=x+rnorm(50,mean=50,sd=.1)
cor(x,y)
## [1] 0.9945578
set.seed(100)
x = rnorm(100)
x
##   [1] -0.50219235  0.13153117 -0.07891709  0.88678481  0.11697127  0.31863009
##   [7] -0.58179068  0.71453271 -0.82525943 -0.35986213  0.08988614  0.09627446
##  [13] -0.20163395  0.73984050  0.12337950 -0.02931671 -0.38885425  0.51085626
##  [19] -0.91381419  2.31029682 -0.43808998  0.76406062  0.26196129  0.77340460
##  [25] -0.81437912 -0.43845057 -0.72022155  0.23094453 -1.15772946  0.24707599
##  [31] -0.09111356  1.75737562 -0.13792961 -0.11119350 -0.69001432 -0.22179423
##  [37]  0.18290768  0.41732329  1.06540233  0.97020202 -0.10162924  1.40320349
##  [43] -1.77677563  0.62286739 -0.52228335  1.32223096 -0.36344033  1.31906574
##  [49]  0.04377907 -1.87865588 -0.44706218 -1.73859795  0.17886485  1.89746570
##  [55] -2.27192549  0.98046414 -1.39882562  1.82487242  1.38129873 -0.83885188
##  [61] -0.26199577 -0.06884403 -0.37888356  2.58195893  0.12983414 -0.71302498
##  [67]  0.63799424  0.20169159 -0.06991695 -0.09248988  0.44890327 -1.06435567
##  [73] -1.16241932  1.64852175 -2.06209602  0.01274972 -1.08752835  0.27053949
##  [79]  1.00845187 -2.07440475  0.89682227 -0.04999577 -1.34534931 -1.93121153
##  [85]  0.70958158 -0.15790503  0.21636787  0.81736208  1.72717575 -0.10377029
##  [91] -0.55712229  1.42830143 -0.89295740 -1.15757124 -0.53029645  2.44568276
##  [97] -0.83249580  0.41351985 -1.17868314 -1.17403476

mean

mean(x)
## [1] 0.002912563

variance

var(x)
## [1] 1.04185

standard deviation

sqrt(var(x))
## [1] 1.02071
sd(x)
## [1] 1.02071

bias

#install.packages("SimDesign")
library(SimDesign)
actual_temp <- c(68.3, 70, 72.4, 71, 67, 70)
predicted_temp <- c(67.9, 69, 71.5, 70, 67, 69)
bias(actual_temp,predicted_temp)
## [1] 0.7166667
actual_temp <- c(150, 203, 137, 247, 116, 287)
predicted_temp <- c(200, 300, 150, 250, 150, 300)
bias(actual_temp,predicted_temp)
## [1] -35

graphics

x=rnorm(100)
y=rnorm(100)
plot(x,y)

plot(x,y,xlab="this is the x-axis",ylab="this is the y-axis",
main="Plot of X vs Y")

dual axis

library(latticeExtra)
## Loading required package: lattice
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
 
 
# usual line chart
xyplot(var1 + var2 ~ x, data, type = "l", col=c("steelblue", "#69b3a2") , lwd=2)

#library
library(latticeExtra)
 
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
 
 
# --> construct separate plots for each series
obj1 <- xyplot(var1 ~ x, data, type = "l" , lwd=2, col="steelblue")
obj2 <- xyplot(var2 ~ x, data, type = "l", lwd=2, col="#69b3a2")
 
# --> Make the plot with second y axis:
doubleYScale(obj1, obj2, add.ylab2 = TRUE, use.style=FALSE )

#library
library(latticeExtra)
 
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
 
 
# --> construct separate plots for each series
obj1 <- xyplot(var1 ~ x, data, type = "l" , lwd=2)
obj2 <- xyplot(var2 ~ x, data, type = "l", lwd=2)
 
# --> Make the plot with second y axis AND legend:
doubleYScale(obj1, obj2, text = c("obj1", "obj2") , add.ylab2 = TRUE)

#library
library(latticeExtra)
 
# create data
set.seed(1)
x <- 1:100
var1 <- cumsum(rnorm(100))
var2 <- var1^2
data <- data.frame(x,var1,var2)
 
 
# --> construct separate plots for each series
obj1 <- xyplot(var1 ~ x, data, type = "l" , lwd=2)
obj2 <- xyplot(var2 ~ x, data, type = "l", lwd=2)
 
# --> Make the plot with second y axis AND legend:
doubleYScale(obj1, obj2, text = c("obj1", "obj2") , add.ylab2 = TRUE)

filter

library(tibble)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Create a sample dataset using the `tibble` function from the `tibble` package.
my_data <- tibble(
  Name = c("Alice", "Bob", "Carol", "Dave"),
  Age = c(25, 30, 35, 40),
  Gender = c("Female", "Male", "Female", "Male")
)
my_data 
## # A tibble: 4 × 3
##   Name    Age Gender
##   <chr> <dbl> <chr> 
## 1 Alice    25 Female
## 2 Bob      30 Male  
## 3 Carol    35 Female
## 4 Dave     40 Male
#Create a filter to the dataset using the `filter` function from `dplyr`.
my_filtered_data <- my_data %>%
  filter(Age > 30)
my_filtered_data
## # A tibble: 2 × 3
##   Name    Age Gender
##   <chr> <dbl> <chr> 
## 1 Carol    35 Female
## 2 Dave     40 Male
#Create a nested filter function by applying two `filter` functions sequentially using `%>%`.
my_nested_filtered_data <- my_data %>%
  filter(Gender == "Female") %>%
  filter(Age > 30)
my_nested_filtered_data
## # A tibble: 1 × 3
##   Name    Age Gender
##   <chr> <dbl> <chr> 
## 1 Carol    35 Female

time

#define two datetimes
first <- "2022-08-20 08:15:22"
second <- "2022-01-01 20:04:48"

#calculate time difference in days
difftime(first, second)
## Time difference of 230.4657 days
#Time difference of 230.5073 days

#calculate time difference in seconds
difftime(first, second, units="secs")
## Time difference of 19912234 secs
#Time difference of 19915834 secs

#calculate time difference in minutes
difftime(first, second, units="mins")
## Time difference of 331870.6 mins
#Time difference of 331930.6 mins

#calculate time difference in hours
difftime(first, second, units="hours")
## Time difference of 5531.176 hours
#Time difference of 5532.176 hours

#calculate time difference in weeks
difftime(first, second, units="weeks")
## Time difference of 32.92367 weeks
#Time difference of 32.92962 weeks
library(hms)

#define two datetimes
first <- "2022-01-01 20:15:22"
second <- "2022-01-01 08:04:48"

#calculate difference between datetimes in hours, minutes, seconds
as_hms(difftime(first, second))
## 12:10:34

Graph

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(igraphdata)

data(UKfaculty)
plot(UKfaculty)

Clustering

clustering <- cluster_leiden(as.undirected(UKfaculty))
plot(clustering, UKfaculty, edge.arrow.size = 0.5, cex = 0.5)

clustering <- cluster_leiden(as.undirected(UKfaculty), objective_function = "modularity")
plot(clustering, UKfaculty, edge.arrow.size = 0.5, cex = 0.5)

clustering <- cluster_leiden(as.undirected(UKfaculty), objective_function = "modularity", resolution_parameter = 0.8)
plot(clustering, UKfaculty, edge.arrow.size = 0.5, cex = 0.5)

Data Manipulation and Data Processing

create table

# Example data
gender <- c("Male", "Female", "Male", "Female", "Male", "Male")
occupation <- c("Engineer", "Teacher", "Engineer", "Doctor", "Teacher", "Doctor")

# Create a contingency table
my_table <- table(gender, occupation)

# Display the table
print(my_table)
##         occupation
## gender   Doctor Engineer Teacher
##   Female      1        0       1
##   Male        1        2       1

create randomly-generated table

# Set the seed for reproducibility
set.seed(123)

# Generate random gender and occupation data
n <- 6  # Number of data points
genders <- sample(c("Male", "Female"), n, replace = TRUE)
occupations <- sample(c("Engineer", "Teacher", "Doctor"), n, replace = TRUE)

# Create a contingency table
my_table <- table(genders, occupations)

# Display the table
print(my_table)
##         occupations
## genders  Doctor Engineer Teacher
##   Female      0        1       1
##   Male        1        0       3

create data frame

# Example data
name <- c("John", "Jane", "Michael", "Emma")
age <- c(25, 30, 22, 28)
city <- c("New York", "Los Angeles", "Chicago", "San Francisco")

# Create a data frame
my_data_frame <- data.frame(Name = name, Age = age, City = city)

# Display the table
print(my_data_frame)
##      Name Age          City
## 1    John  25      New York
## 2    Jane  30   Los Angeles
## 3 Michael  22       Chicago
## 4    Emma  28 San Francisco

create randomly-generated data frame

# Set the seed for reproducibility
set.seed(123)

# Create a 5x5 data frame
data <- data.frame(
  Letters = c(sample(letters, 3, replace = TRUE), rep(sample(letters, 1), 2)),
  Numbers1 = round(runif(5, min = 0, max = 10), 1),
  Numbers2 = c(1.1, 2.2, 3.3, 4.4, 5.5),  # Non-random values
  Numbers3 = c(0.5, 1.0, 1.5, 2.0, 2.5),  # Non-random values
  Numbers4 = c(0.1, 0.2, 0.3, 0.4, 0.5)  # Non-random values
)

# Print the data frame
print(data)
##   Letters Numbers1 Numbers2 Numbers3 Numbers4
## 1       o      0.5      1.1      0.5      0.1
## 2       s      5.3      2.2      1.0      0.2
## 3       n      8.9      3.3      1.5      0.3
## 4       c      5.5      4.4      2.0      0.4
## 5       c      4.6      5.5      2.5      0.5

countif

# Count the occurrences of each letter
letter_counts <- table(data$Letters)

# Print the letter counts
print(letter_counts)
## 
## c n o s 
## 2 1 1 1
# Count the occurrences of each letter
letter_counts <- table(data$Letters)

# Create a data frame to display the counts
letter_counts_df <- data.frame(Letter = names(letter_counts), Count = as.vector(letter_counts))

# Print the data frame with counts
print(letter_counts_df)
##   Letter Count
## 1      c     2
## 2      n     1
## 3      o     1
## 4      s     1

sumif

# Load the dplyr package
library(dplyr)

# Assuming you have already created the 'data' data frame as shown in your code

# Calculate the sum for each letter
result <- data %>%
  group_by(Letters) %>%
  summarize(
    Sum_Numbers1 = sum(Numbers1),
    Sum_Numbers2 = sum(Numbers2),
    Sum_Numbers3 = sum(Numbers3),
    Sum_Numbers4 = sum(Numbers4)
  )

# Print the result
print(result)
## # A tibble: 4 × 5
##   Letters Sum_Numbers1 Sum_Numbers2 Sum_Numbers3 Sum_Numbers4
##   <chr>          <dbl>        <dbl>        <dbl>        <dbl>
## 1 c               10.1          9.9          4.5          0.9
## 2 n                8.9          3.3          1.5          0.3
## 3 o                0.5          1.1          0.5          0.1
## 4 s                5.3          2.2          1            0.2

add row

# Example data frame
name <- c("John", "Jane", "Michael", "Emma")
age <- c(25, 30, 22, 28)
city <- c("New York", "Los Angeles", "Chicago", "San Francisco")

my_table <- data.frame(Name = name, Age = age, City = city)

# New row data
new_row <- data.frame(Name = "Jake", Age = 35, City = "Boston")

# Add the new row to the data frame
row_added_table <- rbind(my_table, new_row)

# Display the updated table
print(row_added_table)
##      Name Age          City
## 1    John  25      New York
## 2    Jane  30   Los Angeles
## 3 Michael  22       Chicago
## 4    Emma  28 San Francisco
## 5    Jake  35        Boston

delete row

# Remove the second row
row_deleted_table <- my_table[-2, ]

# Display the updated table
print(row_deleted_table)
##      Name Age          City
## 1    John  25      New York
## 3 Michael  22       Chicago
## 4    Emma  28 San Francisco

add column

# Example data frame
name <- c("John", "Jane", "Michael", "Emma")
age <- c(25, 30, 22, 28)
city <- c("New York", "Los Angeles", "Chicago", "San Francisco")

column_added_table <- data.frame(Name = name, Age = age, City = city)

# New column data
new_column <- c("USA", "USA", "USA", "USA")

# Add the new column to the data frame
column_added_table$Country <- new_column

# Display the updated table
print(column_added_table)
##      Name Age          City Country
## 1    John  25      New York     USA
## 2    Jane  30   Los Angeles     USA
## 3 Michael  22       Chicago     USA
## 4    Emma  28 San Francisco     USA

delete column

# Remove the 'City' column
column_deleted_table <- my_table[, -which(names(my_table) == "City")]

# Display the updated table
print(column_deleted_table)
##      Name Age
## 1    John  25
## 2    Jane  30
## 3 Michael  22
## 4    Emma  28

left join

# left join
# Sample data frames
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
                  Name = c("Alice", "Bob", "Charlie", "David", "Eve"))

df2 <- data.frame(ID = c(1, 3, 5, 6),
                  Age = c(25, 30, 27, 22))

# Perform a left join
result <- merge(df1, df2, by = "ID", all.x = TRUE)

print(df1)
##   ID    Name
## 1  1   Alice
## 2  2     Bob
## 3  3 Charlie
## 4  4   David
## 5  5     Eve
print(df2)
##   ID Age
## 1  1  25
## 2  3  30
## 3  5  27
## 4  6  22
print(result)
##   ID    Name Age
## 1  1   Alice  25
## 2  2     Bob  NA
## 3  3 Charlie  30
## 4  4   David  NA
## 5  5     Eve  27

right join

# right join
# Sample data frames
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
                  Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

df2 <- data.frame(ID = c(3, 4, 5, 6, 7),
                  Age = c(25, 30, 28, 22, 27))

# Perform right join on 'ID' column
result <- merge(x = df1, y = df2, by = "ID", all.x = FALSE, all.y = TRUE)

print(result)
##   ID    Name Age
## 1  3 Charlie  25
## 2  4   David  30
## 3  5     Eva  28
## 4  6    <NA>  22
## 5  7    <NA>  27

inner join

# inner join
# Sample data frame 1
df1 <- data.frame(ID = c(1, 2, 3, 4, 5),
                  Name = c("Alice", "Bob", "Charlie", "David", "Eva"))

# Sample data frame 2
df2 <- data.frame(ID = c(2, 4, 5, 6, 7),
                  Age = c(25, 30, 28, 22, 27))

# Inner join based on 'ID'
inner_joined <- merge(df1, df2, by = "ID")

print(inner_joined)
##   ID  Name Age
## 1  2   Bob  25
## 2  4 David  30
## 3  5   Eva  28

outer join

# outer join
# Sample data frames
df1 <- data.frame(ID = c(1, 2, 3, 4),
                  Name = c("Alice", "Bob", "Charlie", "David"))

df2 <- data.frame(ID = c(2, 3, 5),
                  Age = c(25, 30, 22))

# Perform an outer join on 'ID' column
merged_df <- merge(df1, df2, by = "ID", all = TRUE)

print(merged_df)
##   ID    Name Age
## 1  1   Alice  NA
## 2  2     Bob  25
## 3  3 Charlie  30
## 4  4   David  NA
## 5  5    <NA>  22

semi join

# semi join
# Load the dplyr package
library(dplyr)

# Example data frame 1
df1 <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  Name = c("Alice", "Bob", "Charlie", "David", "Eve")
)

# Example data frame 2
df2 <- data.frame(
  ID = c(2, 4, 5),
  Age = c(25, 30, 27)
)

# Perform the semi-join based on the "ID" column
left_semi_join_result <- semi_join(df1, df2, by = "ID")
right_semi_join_result <- semi_join(df2, df1, by = "ID")

# Print the result
print(left_semi_join_result)
##   ID  Name
## 1  2   Bob
## 2  4 David
## 3  5   Eve
print(right_semi_join_result)
##   ID Age
## 1  2  25
## 2  4  30
## 3  5  27

anti join

# anti join
# Load the dplyr package
library(dplyr)

# Sample data: two data frames
left_df <- data.frame(ID = c(1, 2, 3, 4, 5),
                      Value = c("A", "B", "C", "D", "E"))

right_df <- data.frame(ID = c(3, 4),
                       Value = c("C", "D"))

# Performing the anti-join
left_anti_join_result <- anti_join(left_df, right_df, by = "ID")
right_anti_join_result <- anti_join(left_df, right_df, by = "ID")

# Output the result
print(left_anti_join_result)
##   ID Value
## 1  1     A
## 2  2     B
## 3  5     E
print(right_anti_join_result)
##   ID Value
## 1  1     A
## 2  2     B
## 3  5     E

Machine Learning

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(ISLR)
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

Simple Linear Regression

lm.fit=lm(medv~lstat,data=Boston)
lm.fit
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Coefficients:
## (Intercept)        lstat  
##       34.55        -0.95
names(lm.fit)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit)
## (Intercept)       lstat 
##  34.5538409  -0.9500494
confint(lm.fit)
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505
predict(lm.fit,data.frame(lstat=(c(5,10 ,15))),interval ="confidence")
##        fit      lwr      upr
## 1 29.80359 29.00741 30.59978
## 2 25.05335 24.47413 25.63256
## 3 20.30310 19.73159 20.87461
predict(lm.fit,data.frame(lstat=(c(5,10 ,15))),interval ="prediction")
##        fit       lwr      upr
## 1 29.80359 17.565675 42.04151
## 2 25.05335 12.827626 37.27907
## 3 20.30310  8.077742 32.52846

Multiple Linear Regression

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
lm.fit1=lm(medv~lstat+age,data=Boston)
summary(lm.fit1)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
names(lm.fit1)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit1)
## (Intercept)       lstat         age 
## 33.22276053 -1.03206856  0.03454434
confint(lm.fit1)
##                   2.5 %      97.5 %
## (Intercept) 31.78687150 34.65864956
## lstat       -1.12674848 -0.93738865
## age          0.01052507  0.05856361
vif(lm.fit1)
##    lstat      age 
## 1.569395 1.569395
lm.fit2=lm(medv~.,data=Boston)
summary(lm.fit2)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
names(lm.fit2)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit2)
##   (Intercept)          crim            zn         indus          chas 
##  3.645949e+01 -1.080114e-01  4.642046e-02  2.055863e-02  2.686734e+00 
##           nox            rm           age           dis           rad 
## -1.776661e+01  3.809865e+00  6.922246e-04 -1.475567e+00  3.060495e-01 
##           tax       ptratio         black         lstat 
## -1.233459e-02 -9.527472e-01  9.311683e-03 -5.247584e-01
confint(lm.fit2)
##                     2.5 %        97.5 %
## (Intercept)  26.432226009  46.486750761
## crim         -0.172584412  -0.043438304
## zn            0.019448778   0.073392139
## indus        -0.100267941   0.141385193
## chas          0.993904193   4.379563446
## nox         -25.271633564 -10.261588893
## rm            2.988726773   4.631003640
## age          -0.025262320   0.026646769
## dis          -1.867454981  -1.083678710
## rad           0.175692169   0.436406789
## tax          -0.019723286  -0.004945902
## ptratio      -1.209795296  -0.695699168
## black         0.004034306   0.014589060
## lstat        -0.624403622  -0.425113133
vif(lm.fit2)
##     crim       zn    indus     chas      nox       rm      age      dis 
## 1.792192 2.298758 3.991596 1.073995 4.393720 1.933744 3.100826 3.955945 
##      rad      tax  ptratio    black    lstat 
## 7.484496 9.008554 1.799084 1.348521 2.941491
lm.fit3=lm(medv~.-age,data=Boston)
summary(lm.fit3)
## 
## Call:
## lm(formula = medv ~ . - age, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6054  -2.7313  -0.5188   1.7601  26.2243 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.436927   5.080119   7.172 2.72e-12 ***
## crim         -0.108006   0.032832  -3.290 0.001075 ** 
## zn            0.046334   0.013613   3.404 0.000719 ***
## indus         0.020562   0.061433   0.335 0.737989    
## chas          2.689026   0.859598   3.128 0.001863 ** 
## nox         -17.713540   3.679308  -4.814 1.97e-06 ***
## rm            3.814394   0.408480   9.338  < 2e-16 ***
## dis          -1.478612   0.190611  -7.757 5.03e-14 ***
## rad           0.305786   0.066089   4.627 4.75e-06 ***
## tax          -0.012329   0.003755  -3.283 0.001099 ** 
## ptratio      -0.952211   0.130294  -7.308 1.10e-12 ***
## black         0.009321   0.002678   3.481 0.000544 ***
## lstat        -0.523852   0.047625 -10.999  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.74 on 493 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7343 
## F-statistic: 117.3 on 12 and 493 DF,  p-value: < 2.2e-16
names(lm.fit3)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
coef(lm.fit3)
##   (Intercept)          crim            zn         indus          chas 
##  36.436926648  -0.108005604   0.046333661   0.020562177   2.689026199 
##           nox            rm           dis           rad           tax 
## -17.713539860   3.814393564  -1.478611555   0.305785940  -0.012328692 
##       ptratio         black         lstat 
##  -0.952211173   0.009320653  -0.523851840
confint(lm.fit3)
##                    2.5 %        97.5 %
## (Intercept)  26.45557192  46.418281372
## crim         -0.17251263  -0.043498576
## zn            0.01958627   0.073081052
## indus        -0.10014145   0.141265800
## chas          1.00009928   4.377953122
## nox         -24.94259876 -10.484480962
## rm            3.01181752   4.616969603
## dis          -1.85312219  -1.104100922
## rad           0.17593586   0.435636023
## tax          -0.01970656  -0.004950825
## ptratio      -1.20821163  -0.696210713
## black         0.00405936   0.014581947
## lstat        -0.61742541  -0.430278274
vif(lm.fit3)
##     crim       zn    indus     chas      nox       rm      dis      rad 
## 1.792172 2.265290 3.991592 1.071227 4.084846 1.851068 3.620246 7.441492 
##      tax  ptratio    black    lstat 
## 9.000474 1.788084 1.343044 2.599229

Interaction Terms

summary(lm(medv~lstat*age,data=Boston))
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

Non-Linear Terms

lm.fit4=lm(medv~lstat+I(lstat^2),data=Boston)
summary(lm.fit4)
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.862007   0.872084   49.15   <2e-16 ***
## lstat       -2.332821   0.123803  -18.84   <2e-16 ***
## I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
lm.fit5=lm(Boston$medv~poly(Boston$lstat,5))
summary(lm.fit5)
## 
## Call:
## lm(formula = Boston$medv ~ poly(Boston$lstat, 5))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.5433  -3.1039  -0.7052   2.0844  27.1153 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              22.5328     0.2318  97.197  < 2e-16 ***
## poly(Boston$lstat, 5)1 -152.4595     5.2148 -29.236  < 2e-16 ***
## poly(Boston$lstat, 5)2   64.2272     5.2148  12.316  < 2e-16 ***
## poly(Boston$lstat, 5)3  -27.0511     5.2148  -5.187 3.10e-07 ***
## poly(Boston$lstat, 5)4   25.4517     5.2148   4.881 1.42e-06 ***
## poly(Boston$lstat, 5)5  -19.2524     5.2148  -3.692 0.000247 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.215 on 500 degrees of freedom
## Multiple R-squared:  0.6817, Adjusted R-squared:  0.6785 
## F-statistic: 214.2 on 5 and 500 DF,  p-value: < 2.2e-16
summary(lm(medv~log(rm),data=Boston))
## 
## Call:
## lm(formula = medv ~ log(rm), data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.487  -2.875  -0.104   2.837  39.816 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -76.488      5.028  -15.21   <2e-16 ***
## log(rm)       54.055      2.739   19.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.915 on 504 degrees of freedom
## Multiple R-squared:  0.4358, Adjusted R-squared:  0.4347 
## F-statistic: 389.3 on 1 and 504 DF,  p-value: < 2.2e-16
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,data=Smarket,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
coef(glm.fit)
##  (Intercept)         Lag1         Lag2         Lag3         Lag4         Lag5 
## -0.126000257 -0.073073746 -0.042301344  0.011085108  0.009358938  0.010313068 
##       Volume 
##  0.135440659
summary(glm.fit)$coef
##                 Estimate Std. Error    z value  Pr(>|z|)
## (Intercept) -0.126000257 0.24073574 -0.5233966 0.6006983
## Lag1        -0.073073746 0.05016739 -1.4565986 0.1452272
## Lag2        -0.042301344 0.05008605 -0.8445733 0.3983491
## Lag3         0.011085108 0.04993854  0.2219750 0.8243333
## Lag4         0.009358938 0.04997413  0.1872757 0.8514445
## Lag5         0.010313068 0.04951146  0.2082966 0.8349974
## Volume       0.135440659 0.15835970  0.8552723 0.3924004

Geospatial Analysis

library(leaflet)
leaflet() %>% 
  addTiles() %>% 
  setView(lng = c(-74.0060), lat = c(40.7128), zoom = 5)
leaflet(options = leafletOptions(zoomControl = FALSE)) %>%
  addProviderTiles(providers$CartoDB.DarkMatter) %>% 
  setView(lng = -74.0060, lat = 40.7128, zoom = 13)
leaflet(options = leafletOptions(zoomControl = FALSE)) %>%
    addProviderTiles(providers$CartoDB.Positron) %>% 
    setView(lng = -74.0060, lat = 40.7128, zoom = 13)
library(ggmap)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:latticeExtra':
## 
##     layer
## The legacy packages maptools, rgdal, and rgeos, underpinning this package
## will retire shortly. Please refer to R-spatial evolution reports on
## https://r-spatial.org/r/2023/05/15/evolution4.html for details.
## This package is now running under evolution status 0
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
usa <- map_data('usa')
ggplot(data=usa, aes(x=long, y=lat, group=group)) + 
  geom_polygon(fill='lightblue') + 
  theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),
        axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + 
  ggtitle('U.S. Map') + 
  coord_fixed(1.3)

#install.packages("maps")
library(maps)
maps::map("state", interior=FALSE)
maps::map("state", boundary=FALSE, col="gray", add=TRUE)

state <- map_data("state")
ggplot(data=state, aes(x=long, y=lat, fill=region, group=group)) + 
  geom_polygon(color = "white") + 
  guides(fill=FALSE) + 
  theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),
        axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + 
  ggtitle('U.S. Map with States') + 
  coord_fixed(1.3)

new_york <- subset(state, region=="new york")
counties <- map_data("county")
new_york_county <- subset(counties, region=="new york")
ny_map <- ggplot(data=new_york, mapping=aes(x=long, y=lat, group=group)) + 
  coord_fixed(1.3) + 
  geom_polygon(color="black", fill="gray") + 
  geom_polygon(data=new_york_county, fill=NA, color="white") + 
  geom_polygon(color="black", fill=NA) + 
  ggtitle('New York State Counties') + 
  theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank(),
        axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank())
ny_map

points = sf::st_read('~/Downloads/Collisions/Collisions.shp') 
## Reading layer `Collisions' from data source 
##   `/Users/sunlei/Downloads/Collisions/Collisions.shp' using driver `ESRI Shapefile'
## replacing null geometries with empty geometries
## Simple feature collection with 236811 features and 38 fields (with 7705 geometries empty)
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -122.4253 ymin: 47.49557 xmax: -122.2389 ymax: 47.73414
## Geodetic CRS:  WGS 84
leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>% 
  setView(lng = -122.330412, lat = 47.609056, zoom = 14) %>% 
  addCircleMarkers(data = points, fillColor = 'red', fillOpacity = 0.6, stroke = FALSE,
                   radius = 4, clusterOptions = markerClusterOptions())
x = sf::st_read('~/Downloads/2016_Traffic_Flow_Counts/2016_Traffic_Flow_Counts.shp') 
## Reading layer `2016_Traffic_Flow_Counts' from data source 
##   `/Users/sunlei/Downloads/2016_Traffic_Flow_Counts/2016_Traffic_Flow_Counts.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1865 features and 14 fields
## Geometry type: MULTILINESTRING
## Dimension:     XY
## Bounding box:  xmin: -122.4134 ymin: 47.49599 xmax: -122.2372 ymax: 47.73414
## Geodetic CRS:  WGS 84
my_breaks = function(x) { cut(x, breaks = c(0, 1e4, 2.5e4, 5e4, 7.5e4, 1e5, 1.25e5),
                              labels = c(1, 2, 4, 8, 12, 16)) }
leaflet() %>%
  addProviderTiles(providers$CartoDB.DarkMatter) %>% 
  setView(lng = -122.330412, lat = 47.609056, zoom = 11) %>%  
  addPolylines(data = x, weight = ~my_breaks(AWDT_ROUND), color = 'orange', opacity = 1)

Exit Project

library(readxl)
setwd("~/Home/Exit Project")
data <- read_excel("Exit Project++.xlsx")
## New names:
## • `Cost per motorist` -> `Cost per motorist...54`
## • `Cost per motorist` -> `Cost per motorist...55`
## • `Cost per motorist` -> `Cost per motorist...57`
data
## # A tibble: 53 × 63
##    State             latitude longitude `Drove Alone` Carpooled `Public Transit`
##    <chr>                <dbl>     <dbl>         <dbl>     <dbl>            <dbl>
##  1 United States         NA        NA              NA        NA               NA
##  2 Alabama               32.8     -86.8       1776620    176668             7875
##  3 Alaska                61.4    -152.         244771     43680             6039
##  4 Arizona               33.7    -111.        2363506    336630            61372
##  5 Arkansas              35.0     -92.4       1066828    133919             5296
##  6 California            36.1    -120.       13411041   1841632           958126
##  7 Colorado              39.1    -105.        2165547    250417            89955
##  8 Connecticut           41.6     -72.8       1397871    141799            86407
##  9 Delaware              39.3     -75.5        364263     37721            11670
## 10 District of Colu…     38.9     -77.0        125308     19464           133343
## # ℹ 43 more rows
## # ℹ 57 more variables: `Public Transit Usage Percentage` <dbl>,
## #   Motorcycle <dbl>, Bicycle <dbl>, Walked <dbl>, Other <dbl>,
## #   `Worked at Home` <dbl>, Population <chr>, `2017 Population` <dbl>,
## #   `2018 Population` <dbl>, `2019 Population` <dbl>, `2020 Population` <dbl>,
## #   `2021 Population` <dbl>, `Square Miles` <chr>,
## #   `People Per Square Mile` <chr>, `Median Age` <chr>, …
data1 <- read_excel("~/Home/Exit Project/Exit Project++.xlsx", sheet = "World high-speed rail kilometer")
## New names:
## • `Year` -> `Year...1`
## • `Germany` -> `Germany...2`
## • `Japan` -> `Japan...3`
## • `China` -> `China...4`
## • `France` -> `France...5`
## • `Spain` -> `Spain...6`
## • `Italy` -> `Italy...7`
## • `US` -> `US...8`
## • `UK` -> `UK...9`
## • `South Korea` -> `South Korea...10`
## • `Uzbekistan` -> `Uzbekistan...12`
## • `Belgium` -> `Belgium...13`
## • `Netherlands` -> `Netherlands...14`
## • `Switzerland` -> `Switzerland...15`
## • `Denmark` -> `Denmark...16`
## • `Morocco` -> `Morocco...17`
## • `Saudi Arabia` -> `Saudi Arabia...18`
## • `Russia` -> `Russia...19`
## • `Turkey` -> `Turkey...20`
## • `` -> `...21`
## • `Year` -> `Year...22`
## • `Germany` -> `Germany...23`
## • `Japan` -> `Japan...24`
## • `China` -> `China...25`
## • `France` -> `France...26`
## • `Spain` -> `Spain...27`
## • `Italy` -> `Italy...28`
## • `US` -> `US...29`
## • `UK` -> `UK...30`
## • `South Korea` -> `South Korea...31`
## • `Uzbekistan` -> `Uzbekistan...32`
## • `Belgium` -> `Belgium...33`
## • `Netherlands` -> `Netherlands...34`
## • `Switzerland` -> `Switzerland...35`
## • `Denmark` -> `Denmark...36`
## • `Morocco` -> `Morocco...37`
## • `Saudi Arabia` -> `Saudi Arabia...38`
## • `Russia` -> `Russia...39`
## • `Turkey` -> `Turkey...40`
data1
## # A tibble: 58 × 40
##    Year...1 Germany...2 Japan...3 China...4 France...5 Spain...6 Italy...7
##       <dbl>       <dbl>     <dbl>     <dbl>      <dbl>     <dbl>     <dbl>
##  1     1963           0        0          0          0         0         0
##  2     1964           0      515.         0          0         0         0
##  3     1965           0      515.         0          0         0         0
##  4     1966           0      515.         0          0         0         0
##  5     1967           0      515.         0          0         0         0
##  6     1968           0      515.         0          0         0         0
##  7     1969           0      515.         0          0         0         0
##  8     1970           0      515.         0          0         0         0
##  9     1971           0      515.         0          0         0         0
## 10     1972           0      676.         0          0         0         0
## # ℹ 48 more rows
## # ℹ 33 more variables: US...8 <dbl>, UK...9 <dbl>, `South Korea...10` <dbl>,
## #   Taiwan <dbl>, Uzbekistan...12 <dbl>, Belgium...13 <dbl>,
## #   Netherlands...14 <dbl>, Switzerland...15 <dbl>, Denmark...16 <dbl>,
## #   Morocco...17 <dbl>, `Saudi Arabia...18` <dbl>, Russia...19 <dbl>,
## #   Turkey...20 <dbl>, ...21 <lgl>, Year...22 <dbl>, Germany...23 <dbl>,
## #   Japan...24 <dbl>, China...25 <dbl>, France...26 <dbl>, Spain...27 <dbl>, …
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%--%()       masks igraph::%--%()
## ✖ igraph::as_data_frame() masks dplyr::as_data_frame(), tibble::as_data_frame()
## ✖ purrr::compose()        masks igraph::compose()
## ✖ tidyr::crossing()       masks igraph::crossing()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ lubridate::hms()        masks hms::hms()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ ggplot2::layer()        masks latticeExtra::layer()
## ✖ purrr::map()            masks maps::map()
## ✖ car::recode()           masks dplyr::recode()
## ✖ MASS::select()          masks dplyr::select()
## ✖ purrr::simplify()       masks igraph::simplify()
## ✖ purrr::some()           masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data_long1_1 <- data1[,1:20] %>% 
  pivot_longer(cols = -Year...1, names_to = "Country", values_to = "Kilometers")
data_long1_1
## # A tibble: 1,102 × 3
##    Year...1 Country          Kilometers
##       <dbl> <chr>                 <dbl>
##  1     1963 Germany...2               0
##  2     1963 Japan...3                 0
##  3     1963 China...4                 0
##  4     1963 France...5                0
##  5     1963 Spain...6                 0
##  6     1963 Italy...7                 0
##  7     1963 US...8                    0
##  8     1963 UK...9                    0
##  9     1963 South Korea...10          0
## 10     1963 Taiwan                    0
## # ℹ 1,092 more rows
library(tidyverse)

data_long1_2 <- data1 %>%
  pivot_longer(cols = -c(Year...1, Year...22:Turkey...40), names_to = "Country", values_to = "Kilometers")

print(data_long1_2)
## # A tibble: 1,160 × 22
##    Year...1 Year...22 Germany...23 Japan...24 China...25 France...26 Spain...27
##       <dbl>     <dbl>        <dbl>      <dbl>      <dbl>       <dbl>      <dbl>
##  1     1963      1963            0          0          0           0          0
##  2     1963      1963            0          0          0           0          0
##  3     1963      1963            0          0          0           0          0
##  4     1963      1963            0          0          0           0          0
##  5     1963      1963            0          0          0           0          0
##  6     1963      1963            0          0          0           0          0
##  7     1963      1963            0          0          0           0          0
##  8     1963      1963            0          0          0           0          0
##  9     1963      1963            0          0          0           0          0
## 10     1963      1963            0          0          0           0          0
## # ℹ 1,150 more rows
## # ℹ 15 more variables: Italy...28 <dbl>, US...29 <dbl>, UK...30 <dbl>,
## #   `South Korea...31` <dbl>, Uzbekistan...32 <dbl>, Belgium...33 <dbl>,
## #   Netherlands...34 <dbl>, Switzerland...35 <dbl>, Denmark...36 <dbl>,
## #   Morocco...37 <dbl>, `Saudi Arabia...38` <dbl>, Russia...39 <dbl>,
## #   Turkey...40 <dbl>, Country <chr>, Kilometers <dbl>
library(readr)

# Assuming you have a data frame named 'data_long' that you want to export

# Export the data frame to a CSV file in your home folder
write_csv(data_long1_1, "~/Downloads/Exit Project_Speed.csv")
library(tidyverse)
data_long2_1 <- data1[,22:40] %>% 
  pivot_longer(cols = -Year...22, names_to = "Country", values_to = "Kilometers")
data_long2_1
## # A tibble: 1,044 × 3
##    Year...22 Country          Kilometers
##        <dbl> <chr>                 <dbl>
##  1      1963 Germany...23              0
##  2      1963 Japan...24                0
##  3      1963 China...25                0
##  4      1963 France...26               0
##  5      1963 Spain...27                0
##  6      1963 Italy...28                0
##  7      1963 US...29                   0
##  8      1963 UK...30                   0
##  9      1963 South Korea...31          0
## 10      1963 Uzbekistan...32           0
## # ℹ 1,034 more rows
library(tidyverse)

data_long2_2 <- data1 %>%
  pivot_longer(cols = -c(Year...1:Turkey...20,Year...22), names_to = "Country", values_to = "Kilometers")

print(data_long2_2)
## # A tibble: 1,102 × 23
##    Year...1 Germany...2 Japan...3 China...4 France...5 Spain...6 Italy...7
##       <dbl>       <dbl>     <dbl>     <dbl>      <dbl>     <dbl>     <dbl>
##  1     1963           0         0         0          0         0         0
##  2     1963           0         0         0          0         0         0
##  3     1963           0         0         0          0         0         0
##  4     1963           0         0         0          0         0         0
##  5     1963           0         0         0          0         0         0
##  6     1963           0         0         0          0         0         0
##  7     1963           0         0         0          0         0         0
##  8     1963           0         0         0          0         0         0
##  9     1963           0         0         0          0         0         0
## 10     1963           0         0         0          0         0         0
## # ℹ 1,092 more rows
## # ℹ 16 more variables: US...8 <dbl>, UK...9 <dbl>, `South Korea...10` <dbl>,
## #   Taiwan <dbl>, Uzbekistan...12 <dbl>, Belgium...13 <dbl>,
## #   Netherlands...14 <dbl>, Switzerland...15 <dbl>, Denmark...16 <dbl>,
## #   Morocco...17 <dbl>, `Saudi Arabia...18` <dbl>, Russia...19 <dbl>,
## #   Turkey...20 <dbl>, Year...22 <dbl>, Country <chr>, Kilometers <dbl>
library(readr)

# Assuming you have a data frame named 'data_long' that you want to export

# Export the data frame to a CSV file in your home folder
write_csv(data_long2_1, "~/Downloads/Exit Project_Distance.csv")
library(readxl)
data_1 = read_excel('~/Home/Exit Project/Exit Project++.xlsx')
## New names:
## • `Cost per motorist` -> `Cost per motorist...54`
## • `Cost per motorist` -> `Cost per motorist...55`
## • `Cost per motorist` -> `Cost per motorist...57`
data_1
## # A tibble: 53 × 63
##    State             latitude longitude `Drove Alone` Carpooled `Public Transit`
##    <chr>                <dbl>     <dbl>         <dbl>     <dbl>            <dbl>
##  1 United States         NA        NA              NA        NA               NA
##  2 Alabama               32.8     -86.8       1776620    176668             7875
##  3 Alaska                61.4    -152.         244771     43680             6039
##  4 Arizona               33.7    -111.        2363506    336630            61372
##  5 Arkansas              35.0     -92.4       1066828    133919             5296
##  6 California            36.1    -120.       13411041   1841632           958126
##  7 Colorado              39.1    -105.        2165547    250417            89955
##  8 Connecticut           41.6     -72.8       1397871    141799            86407
##  9 Delaware              39.3     -75.5        364263     37721            11670
## 10 District of Colu…     38.9     -77.0        125308     19464           133343
## # ℹ 43 more rows
## # ℹ 57 more variables: `Public Transit Usage Percentage` <dbl>,
## #   Motorcycle <dbl>, Bicycle <dbl>, Walked <dbl>, Other <dbl>,
## #   `Worked at Home` <dbl>, Population <chr>, `2017 Population` <dbl>,
## #   `2018 Population` <dbl>, `2019 Population` <dbl>, `2020 Population` <dbl>,
## #   `2021 Population` <dbl>, `Square Miles` <chr>,
## #   `People Per Square Mile` <chr>, `Median Age` <chr>, …
data = na.omit(data_1)
data
## # A tibble: 43 × 63
##    State       latitude longitude `Drove Alone` Carpooled `Public Transit`
##    <chr>          <dbl>     <dbl>         <dbl>     <dbl>            <dbl>
##  1 Alabama         32.8     -86.8       1776620    176668             7875
##  2 Arizona         33.7    -111.        2363506    336630            61372
##  3 Arkansas        35.0     -92.4       1066828    133919             5296
##  4 California      36.1    -120.       13411041   1841632           958126
##  5 Colorado        39.1    -105.        2165547    250417            89955
##  6 Connecticut     41.6     -72.8       1397871    141799            86407
##  7 Delaware        39.3     -75.5        364263     37721            11670
##  8 Florida         27.8     -81.7       7420475    865300           191414
##  9 Georgia         33.0     -83.6       3787591    460494           114935
## 10 Idaho           44.2    -114.         613669     77467             6245
## # ℹ 33 more rows
## # ℹ 57 more variables: `Public Transit Usage Percentage` <dbl>,
## #   Motorcycle <dbl>, Bicycle <dbl>, Walked <dbl>, Other <dbl>,
## #   `Worked at Home` <dbl>, Population <chr>, `2017 Population` <dbl>,
## #   `2018 Population` <dbl>, `2019 Population` <dbl>, `2020 Population` <dbl>,
## #   `2021 Population` <dbl>, `Square Miles` <chr>,
## #   `People Per Square Mile` <chr>, `Median Age` <chr>, …
summary(data)
##     State              latitude       longitude        Drove Alone      
##  Length:43          Min.   :27.77   Min.   :-122.07   Min.   :  327871  
##  Class :character   1st Qu.:35.60   1st Qu.: -99.03   1st Qu.: 1075163  
##  Mode  :character   Median :39.06   Median : -88.99   Median : 1865513  
##                     Mean   :38.90   Mean   : -91.46   Mean   : 2635649  
##                     3rd Qu.:41.85   3rd Qu.: -80.38   3rd Qu.: 3492116  
##                     Max.   :47.53   Max.   : -69.38   Max.   :13411041  
##    Carpooled       Public Transit    Public Transit Usage Percentage
##  Min.   :  35254   Min.   :   2421   Min.   :0.00340                
##  1st Qu.: 130290   1st Qu.:   9265   1st Qu.:0.00805                
##  Median : 207945   Median :  41159   Median :0.01450                
##  Mean   : 311402   Mean   : 170907   Mean   :0.03195                
##  3rd Qu.: 393950   3rd Qu.: 110457   3rd Qu.:0.03470                
##  Max.   :1841632   Max.   :2660632   Max.   :0.28610                
##    Motorcycle       Bicycle           Walked           Other       
##  Min.   :  337   Min.   :   941   Min.   :  9535   Min.   :  2596  
##  1st Qu.: 1862   1st Qu.:  4117   1st Qu.: 25551   1st Qu.: 10488  
##  Median : 2944   Median :  9292   Median : 51588   Median : 21393  
##  Mean   : 5427   Mean   : 17742   Mean   : 86638   Mean   : 31486  
##  3rd Qu.: 5044   3rd Qu.: 20054   3rd Qu.: 93700   3rd Qu.: 40400  
##  Max.   :58407   Max.   :173081   Max.   :570614   Max.   :199011  
##  Worked at Home     Population        2017 Population    2018 Population   
##  Min.   :  17751   Length:43          Min.   :  756755   Min.   :  760062  
##  1st Qu.:  54074   Class :character   1st Qu.: 2981386   1st Qu.: 2997520  
##  Median : 124527   Mode  :character   Median : 5027102   Median : 5091702  
##  Mean   : 178322                      Mean   : 7279905   Mean   : 7319226  
##  3rd Qu.: 226448                      3rd Qu.: 8679579   3rd Qu.: 8701325  
##  Max.   :1073966                      Max.   :39337785   Max.   :39437463  
##  2019 Population    2020 Population    2021 Population    Square Miles      
##  Min.   :  763724   Min.   :  774948   Min.   :  778962   Length:43         
##  1st Qu.: 2999606   1st Qu.: 2987928   1st Qu.: 2984551   Class :character  
##  Median : 5157702   Median : 5190705   Median : 5130729   Mode  :character  
##  Mean   : 7353391   Mean   : 7433600   Mean   : 7424409                     
##  3rd Qu.: 8723950   3rd Qu.: 8954702   3rd Qu.: 8955894                     
##  Max.   :39437610   Max.   :39237836   Max.   :39499738                     
##  People Per Square Mile  Median Age        Under 18 Years        18 to 34      
##  Length:43              Length:43          Length:43          Min.   : 201384  
##  Class :character       Class :character   Class :character   1st Qu.: 680284  
##  Mode  :character       Mode  :character   Mode  :character   Median :1130567  
##                                                               Mean   :1694621  
##                                                               3rd Qu.:1947416  
##                                                               Max.   :9757672  
##     35 to 64         65 and Over      Percentage of 65 and Over
##  Min.   :  266706   Min.   : 112883   Min.   :0.04099          
##  1st Qu.: 1095520   1st Qu.: 453254   1st Qu.:0.09196          
##  Median : 1924179   Median : 804881   Median :0.12148          
##  Mean   : 2793714   Mean   :1135449   Mean   :0.13243          
##  3rd Qu.: 3464342   3rd Qu.:1339216   3rd Qu.:0.17313          
##  Max.   :15017638   Max.   :5486041   Max.   :0.23627          
##      Europe         Percentage of People Born in Europe      Asia        
##  Min.   :     0.2   Min.   :0.04099                     Min.   :   6669  
##  1st Qu.: 14984.0   1st Qu.:0.08994                     1st Qu.:  47272  
##  Median : 42969.0   Median :0.11512                     Median : 125761  
##  Mean   :107729.0   Mean   :0.13130                     Mean   : 307392  
##  3rd Qu.:114327.0   3rd Qu.:0.17313                     3rd Qu.: 334303  
##  Max.   :703049.0   Max.   :0.29790                     Max.   :4157181  
##  Percentage of People Born in Asia     Africa      
##  Min.   :0.1062                    Min.   :   872  
##  1st Qu.:0.2634                    1st Qu.:  9256  
##  Median :0.3107                    Median : 24147  
##  Mean   :0.3099                    Mean   : 51307  
##  3rd Qu.:0.3531                    3rd Qu.: 76468  
##  Max.   :0.5145                    Max.   :248298  
##  Percentage of People Born in Africa    Oceania       
##  Min.   :0.01767                     Min.   :  146.0  
##  1st Qu.:0.03858                     1st Qu.:  914.5  
##  Median :0.06072                     Median : 2819.0  
##  Mean   :0.07864                     Mean   : 5569.4  
##  3rd Qu.:0.10124                     3rd Qu.: 4038.0  
##  Max.   :0.27739                     Max.   :86276.0  
##  Percentage of People Born in Oceania Latin America    
##  Min.   :0.000000                     Min.   :   4310  
##  1st Qu.:0.003065                     1st Qu.:  74370  
##  Median :0.004642                     Median : 146452  
##  Mean   :0.006448                     Mean   : 512753  
##  3rd Qu.:0.007840                     3rd Qu.: 368805  
##  Max.   :0.040200                     Max.   :5302761  
##  Percentage of People Born in Latin America North America   
##  Min.   :0.1037                             Min.   :  1091  
##  1st Qu.:0.3625                             1st Qu.:  3304  
##  Median :0.4537                             Median :  8696  
##  Mean   :0.4407                             Mean   : 18243  
##  3rd Qu.:0.5270                             3rd Qu.: 17204  
##  Max.   :0.7587                             Max.   :130093  
##  Percentage of People Born in North America Less Than High School
##  Min.   :0.008213                           Min.   :0.0645       
##  1st Qu.:0.014713                           1st Qu.:0.0921       
##  Median :0.020802                           Median :0.1031       
##  Mean   :0.031630                           Mean   :0.1098       
##  3rd Qu.:0.029653                           3rd Qu.:0.1298       
##  Max.   :0.202300                           Max.   :0.1669       
##  High School Grad or Higher Bachelor's Degree or Higher Per Capita Income
##  Min.   :0.8331             Min.   :0.2061              Min.   :24369    
##  1st Qu.:0.8702             1st Qu.:0.2746              1st Qu.:29776    
##  Median :0.8969             Median :0.3126              Median :31619    
##  Mean   :0.8902             Mean   :0.3111              Mean   :33030    
##  3rd Qu.:0.9079             3rd Qu.:0.3411              3rd Qu.:36092    
##  Max.   :0.9355             Max.   :0.4369              Max.   :44496    
##  Median Household Income   Gini Index     Persons Below Poverty Level
##  Min.   : 5660           Min.   :0.4300   Min.   : 0.0919            
##  1st Qu.:53961           1st Qu.:0.4600   1st Qu.: 0.1128            
##  Median :59597           Median :0.4700   Median : 0.1336            
##  Mean   :60461           Mean   :0.4688   Mean   : 0.4370            
##  3rd Qu.:68386           3rd Qu.:0.4800   3rd Qu.: 0.1517            
##  Max.   :84805           Max.   :0.5100   Max.   :13.1300            
##  Average Commute to Work House Value Median Value Median Gross Rent
##  Length:43               Min.   :119000           Min.   : 725.0   
##  Class :character        1st Qu.:155350           1st Qu.: 828.0   
##  Mode  :character        Median :190400           Median : 907.0   
##                          Mean   :217377           Mean   : 987.7   
##                          3rd Qu.:270500           3rd Qu.:1120.0   
##                          Max.   :505000           Max.   :1503.0   
##  Difference between Per Capita Income and Median Gross Rent
##  Min.   : 4485                                             
##  1st Qu.:53073                                             
##  Median :58747                                             
##  Mean   :59474                                             
##  3rd Qu.:67182                                             
##  Max.   :83413                                             
##  Moved Since Previous Year  Speed Limit    % non-acceptable roads
##  Min.   :0.1024            Min.   :65.00   Length:43             
##  1st Qu.:0.1298            1st Qu.:70.00   Class :character      
##  Median :0.1473            Median :70.00   Mode  :character      
##  Mean   :0.1461            Mean   :71.86                         
##  3rd Qu.:0.1587            3rd Qu.:75.00                         
##  Max.   :0.1796            Max.   :75.00                         
##  % poor bridge deck (sq. miles area) Cost per motorist...54
##  Length:43                           Length:43             
##  Class :character                    Class :character      
##  Mode  :character                    Mode  :character      
##                                                            
##                                                            
##                                                            
##  Cost per motorist...55 % of spending on road repair Cost per motorist...57
##  Min.   :194.0          Length:43                    Length:43             
##  1st Qu.:469.0          Class :character             Class :character      
##  Median :544.0          Mode  :character             Mode  :character      
##  Mean   :557.5                                                             
##  3rd Qu.:660.5                                                             
##  Max.   :900.0                                                             
##  Registered Automobiles Public Roadway Miles Congestion Index    Rail Miles   
##  Min.   :  864762       Min.   :  6027       Min.   :  9.998   Min.   :   58  
##  1st Qu.: 2195536       1st Qu.: 55160       1st Qu.: 38.856   1st Qu.: 1850  
##  Median : 4148721       Median : 87088       Median : 57.509   Median : 2851  
##  Mean   : 5783891       Mean   : 90658       Mean   : 68.635   Mean   : 2960  
##  3rd Qu.: 7017098       3rd Qu.:117423       3rd Qu.: 86.352   3rd Qu.: 3752  
##  Max.   :28686646       Max.   :313596       Max.   :176.074   Max.   :10506  
##  Transit Passenger Trips Amtrak Ridership  
##  Min.   :2.700e+06       Min.   :    6726  
##  1st Qu.:8.600e+06       1st Qu.:   79691  
##  Median :4.160e+07       Median :  176159  
##  Mean   :2.037e+08       Mean   : 1317485  
##  3rd Qu.:1.282e+08       3rd Qu.:  959590  
##  Max.   :3.900e+09       Max.   :13023167
nrow(data)
## [1] 43
ncol(data)
## [1] 63
head(data$`Rail Miles`)
## [1] 3272 1820 2505 4828 2427  522
data$`Square Miles` = as.numeric(data$`Square Miles`,is.na=FALSE)
data$`People Per Square Mile` = as.numeric(data$`People Per Square Mile`,is.na=FALSE)
data$`Median Age` = as.numeric(data$`Median Age`,is.na=FALSE)
data$`Average Commute to Work` = as.numeric(data$`Average Commute to Work`,is.na=FALSE)
data$`% non-acceptable roads` = as.numeric(data$`% non-acceptable roads`,is.na=FALSE)
data$`% poor bridge deck (sq. miles area)`= as.numeric(data$`% poor bridge deck (sq. miles area)`,is.na=FALSE)
data$`% of spending on road repair` = as.numeric(data$`% of spending on road repair`,is.na=FALSE)
data$`Cost per motorist...54` = as.numeric(data$`Cost per motorist...54`,is.na=FALSE)
data$`Cost per motorist...57` = as.numeric(data$`Cost per motorist...57`,is.na=FALSE)
#sapply(data, class) 
par(mfrow=c(2,2),cex=1)
a1 = boxplot(data$"Per Capita Income",
        ylab = "Per Capita Income", is.na=FALSE)
b1 = boxplot(data$"Average Commute to Work",
        ylab = "Average Commute to Work", is.na=FALSE) 
a2 = plot(`Public Transit Usage Percentage` ~ `Per Capita Income`, data=data)
b2 = plot(`Public Transit Usage Percentage` ~ `Average Commute to Work`, data=data)

set.seed(1)
sampleSize <- round(nrow(data)*0.8)
idx <- sample(seq_len(sampleSize), size = sampleSize)
X.train <- data[idx,]
X.test <- data[-idx,]
X.train
## # A tibble: 34 × 63
##    State         latitude longitude `Drove Alone` Carpooled `Public Transit`
##    <chr>            <dbl>     <dbl>         <dbl>     <dbl>            <dbl>
##  1 California        36.1    -120.       13411041   1841632           958126
##  2 Alabama           32.8     -86.8       1776620    176668             7875
##  3 Arizona           33.7    -111.        2363506    336630            61372
##  4 New York          42.2     -74.9       4931345    601046          2660632
##  5 Missouri          38.5     -92.3       2364888    249373            41159
##  6 Illinois          40.3     -89.0       4487622    484736           599410
##  7 Kansas            38.5     -96.7       1181623    128744             7794
##  8 Maryland          39.1     -76.8       2251348    271403           264311
##  9 Massachusetts     42.2     -71.5       3850705    436089            53625
## 10 Oklahoma          35.6     -96.9       1451796    173011             8597
## # ℹ 24 more rows
## # ℹ 57 more variables: `Public Transit Usage Percentage` <dbl>,
## #   Motorcycle <dbl>, Bicycle <dbl>, Walked <dbl>, Other <dbl>,
## #   `Worked at Home` <dbl>, Population <chr>, `2017 Population` <dbl>,
## #   `2018 Population` <dbl>, `2019 Population` <dbl>, `2020 Population` <dbl>,
## #   `2021 Population` <dbl>, `Square Miles` <dbl>,
## #   `People Per Square Mile` <dbl>, `Median Age` <dbl>, …
X.test
## # A tibble: 9 × 63
##   State          latitude longitude `Drove Alone` Carpooled `Public Transit`
##   <chr>             <dbl>     <dbl>         <dbl>     <dbl>            <dbl>
## 1 Pennsylvania       40.6     -77.2       4617285    514206           348949
## 2 Rhode Island       41.7     -71.5        419037     45101            13911
## 3 South Carolina     33.9     -80.9       1865513    207945            14767
## 4 Tennessee          35.7     -86.7       2548060    273665            23353
## 5 Texas              31.1     -97.6      10560476   1308229           192240
## 6 Utah               40.2    -112.        1124679    159967            36802
## 7 Virginia           37.8     -78.2       3230271    384172           194554
## 8 Washington         47.4    -121.        2555264    355403           239511
## 9 West Virginia      38.5     -81.0        599371     65081             6793
## # ℹ 57 more variables: `Public Transit Usage Percentage` <dbl>,
## #   Motorcycle <dbl>, Bicycle <dbl>, Walked <dbl>, Other <dbl>,
## #   `Worked at Home` <dbl>, Population <chr>, `2017 Population` <dbl>,
## #   `2018 Population` <dbl>, `2019 Population` <dbl>, `2020 Population` <dbl>,
## #   `2021 Population` <dbl>, `Square Miles` <dbl>,
## #   `People Per Square Mile` <dbl>, `Median Age` <dbl>, `Under 18 Years` <chr>,
## #   `18 to 34` <dbl>, `35 to 64` <dbl>, `65 and Over` <dbl>, …
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
PCR.fit <- pcr(`Public Transit Usage Percentage` ~ `Per Capita Income` + `Average Commute to Work`+ `Rail Miles` + `Amtrak Ridership`, data = X.train,scale = TRUE, validation = "CV")
summary(PCR.fit)
## Data:    X dimension: 28 4 
##  Y dimension: 28 1
## Fit method: svdpc
## Number of components considered: 4
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps
## CV         0.05868  0.04845  0.05069  0.05133  0.06049
## adjCV      0.05868  0.04779  0.04981  0.05041  0.05909
## 
## TRAINING: % variance explained
##                                  1 comps  2 comps  3 comps  4 comps
## X                                  52.78    82.29    93.26   100.00
## Public Transit Usage Percentage    57.15    64.50    64.61    66.19
validationplot(PCR.fit, val.type = "MSEP", legendpos = "topright")

PCR.pred_test <- predict(PCR.fit, X.test, ncomp = 2)

Put one of your pre-existing projects on GitHub

In this assignment, you will bring one of your pre-existing projects into the Git/GitHub and RStudio universe so you can start adopting version control and dynamic report generation (RMarkdown) in your own work right away. Below, we’ll guide you through the setup step by step. There are other, perhaps more elegant, ways to add version control to a pre-existing project, but we will use the approach described below because it works well and lets us work through RStudio so we won’t need to use command line to interact with Git. If you don’t have a project that you feel is suitable for GitHub version control, you can set up a hypothetical research project repo instead and create a few hypothetical script and data files.

This assignment is due by 10pm on Thursday 2/10/2021.

To submit, paste the URL to the new project GitHub repo you will create here to the README file in your course repo. Below the table used to record grades, please type

Assignment 2: URL-to-repo [replacing URL-to-repo with the actual URL to your repo].


Acknowledgements

These instructions are adapted from three sources:


Task 1: Select a project to put on GitHub

Decide which of your current projects you want to create a GitHub repo for. GitHub works best for tracking plain text files, so consider choosing a project for which you have some scripts and data files. If you don’t have any projects like that yet, you can think up a fictional project.



Task 2: Make a repo on GitHub

Decide what you want to call the GitHub repository that will host your project. Select a short and descriptive name without spaces (you can use “-” or “_” in place of spaces if you want a multi-word name). Create a new repository on GitHub with that name. When setting up your repo, select the following features:

  • Make the repo Private

  • Add a README.md

  • Just leave the Add .gitignore and “Choose license” boxes unchecked.



Task 3: Invite Nicolas as a collaborator to allow grading

Your repo is now ready. Since you made it private, only you can see it, so don’t worry about putting data and results up there. It will only be visible to people you explicitly invite as collaborators.

To show us that you have completed the assignment, you will therefore need to add Nicolas as a collaborator temporarily. Once he has had a chance to review your setup, you can revoke his collaborator privileges so he won’t have continued access.

To give Nicolas access:

  • Click on the “Settings” tab

  • Click on “Manage access” from the menu on the left

  • Click on the green button labeled Invite a collaborator (note that you may be prompted to enter your GitHub password)

  • In the search box that appears, type nicolas931010 and select Nicolas’ name/username

  • Click on the green button that says Add nicolas931010 to this repository



Task 4: Clone the repo by creating an associated RStudio project

Create a new project in RStudio with a connection to the GitHub repo you just created. Remember that we strongly recommend that you put this project folder in a location not under some other form of version control (e.g. Box, Dropbox, or Google Docs). A great location is to create your project as a subfolder of a directory called github that lives in your root (“home”) directory (i.e. ~/github/)



Task 5: Organize your project folder and copy your existing project files over

On your local computer (i.e. not on the GitHub web interface), create a series of sub-directories that will help you organize your files in the folder you cloned your repo into (i.e. where your .Rproj and README.md live). Different projects will have different types of files, but a good place to start would be creating the following sub-directories:

  • raw-data
  • scripts
  • processed-data
  • results
  • figures
  • rmarkdowns

You can make these sub-directories either by clicking “New Folder” in the “Files” tab in RStudio, or with your computer’s file browser, like Finder on a Mac):

Now, using your favorite method of moving or copying files, copy all the files that constitute your existing project (your code and data etc.) into the appropriate sub-directory for this new project (keep your original copy for now to not disrupt any dependencies of local paths in your current workflow).

Here’s one suggestion for organizing the content of sub-directories, but feel free to adapt to your particular needs:

  • raw-data: external data that you haven’t edited, like original datasets you collected, got from collaborators, or downloaded. Sometimes these have long file names or are super large or are otherwise gnarly; that’s OK.
  • scripts: all of your scripts for data analysis or processing
  • processed-data: data produced by R scripts, e.g., cleaned, tidied, or summarized for analysis (this may be empty right now, that’s OK!)
  • results: output results from your analysis, like model summary files, tables etc (this may be empty right now, that’s OK!)
  • figures: images generated by R scripts that create plots (this may be empty right now, that’s OK!)
  • rmarkdown: RMarkdown files (e.g. reports or analysis logs) that help you keep track of your analysis and outputs

Why are we doing this? A well-organized repository will make it much easier for you to keep track of your data, analysis, and results, and will also facilitate version control.



Task 6: Decide what files to push to GitHub

There are no absolute rules about what types of files should be sync’ed on GitHub and which should not. If you only have code and relatively small data files and outputs like plots and small image files (<100 Mb) in your repo, you should just go ahead and push it all to GitHub.

If you have large files, or files that Git can’t version control (e.g. binary files like Word documents), you may consider just keeping these files local and not pushing them to GitHub. You can do this by adding them to your .gitignore file.

Find guidance on what files to push to GitHub, see Section 10 of Jenny Bryan’s Excuse me, do you have a moment to talk about version control?.

One important thing to note is that GitHub has a file size limit of 100 Mb - you will not be able to push if your commit includes files larger than this size. GitHub also recommends that you ideally keep the total size of your repo smaller than 1 Gb and definitely less than 5 Gb. So find a different way to back up really large files and use GitHub primarily for text-based or figure files (most of which are likely much smaller than 100 Mb).



Task 7: Push your local changes to GitHub

In RStudio, consult the Git pane and the file browser.

  • Are you seeing all the files? They should be here if your copy was successful.
  • Are they showing up in the Git pane with questions marks? They should be appearing as new untracked files.

Stage, commit and push your changes to GitHub.



Task 8: Confirm the local change propagated to the GitHub remote

Go back to the browser and refresh the page with your new GitHub repo.

Refresh.

You should see all the project files you committed there.

If you click on “Commits,” you should see one with the message “init”.



Assignment 3

Create a Website using R Markdown and Github Pages


Instructions: Please read through before you begin

This assignment is due by 10pm on Thursday 02/17/21.

Pair up with the classmate you worked with during lecture 5 and exchange usernames of your personal Github accounts. If you missed lecture 5 or for other reasons need a partner, let us know. You will have to coordinate to do the first part of this exercise together; the second part you can each do individually (but you’re also welcome to collaborate!)

Each of you should complete the following steps:

  • Create a new public repository in your personal Github account, name it assignment-3-NetID (make sure to change to your NetID, Nina’s repo will be assignment-3-nt246), and initialize with a README file.

  • Navigate to the settings for your new Github repo, in the Options tab, scroll down to the Github Pages section and enable it by selecting the main branch of your repo as the source. Make sure the root folder is selected and hit save.

  • Clone this repository to your local computer through RStudio.

  • Create TWO RMarkdown files, choosing HTML as the output option. These files will be used to generate the pages of your website. The file that will render the landing page of your website must be named index.Rmd. Name the second file after yourself (first name is fine, unless you and your partner have the same first name, in which case make sure to name your files differently).

    index.Rmd

    • This is the landing page of your website, make sure it includes the following:
      • A title
      • Your name and the name of your collaborator with links to your personal Github accounts
      • An image with a caption

    nina.Rmd (replace my name with yours)

    • This page should include the following:
      • Your first name as the title
      • An unordered list of some information about you, including at least 3 bullet points
  • Save and knit your two new files. In your git pane, you should see the following:

    • assignment-3-nt246.Rproj
    • .gitignore
    • index.Rmd
    • index.html
    • nina.Rmd
    • nina.html
  • Sync back to Github, remembering to follow these steps:

  • Check your repo on Github to make sure it synced properly.

  • Navigate to your repo Settings on Github, select the Collaborators option in the left side panel and invite your collaborator.

  • Check your email (the one linked to your personal Github account) to accept an invitation from your collaborator.

  • Navigate to your partner’s repo on Github and clone their repository to your local computer through RStudio, just like you did your own. Once you have cloned their repo, copy over your <name>.Rmd file from your repo to theirs. You can do this by using the file finder on your computer: navigate to the local folder of your repo in Finder or Windows File Explorer, copy the file and paste it to the local folder of your partner’s repo. Once the file shows up in RStudio (make sure you are in the R project that corresponds to your collaborator’s repo), sync it back to Github.

  • Open your repo as a project in RStudio, PULL so that you are synced with your remote repo - your collaborator’s Rmarkdown file should now be in your local repo. Open it up and knit the file so that you can preview the HTML output. After this step, you can work independently from your partner to build your website (but feel free to work together for the rest of the assignment).

  • Create a new TEXT file that will contain metadata for the website and save it as _site.yml. The _site.yml file should include the following information (update the file names to correspond to your files):

output_dir: "."
navbar:
  left:
    - text: "Home"
      href: index.html
    - text: "Nina"
      href: nina.html
    - text: "Nicolas"
      href: nicolas.html  


  • In the RStudio console, run rmarkdown::render_site().

  • Sync back to Github.

  • Open a browser and type the link for your website, for example, “https://nt246.github.io/assignment-3-nt246” (but with your GitHub username and repository name instead).


This minimal setup satisfies the requirements for the assignment. However, we strongly encourage you to update the appearance of your website by changing the output theme (reviewed in Lesson 5). You may also include more content on your page(s), add more pages, add and customize a table of contents to your page(s), and/or add drop-down menus in your navigation bar. NOTE: You are working independently at this point, so each of you can customize your website however you like.


To submit, paste the URL to your website to the README file in your course repo. Below the table used to record grades, please type

Assignment 3: URL-to-website [replacing URL-to-website with the actual URL to your published website].

Below the URL, please indicate who you were collaborating with.

We will look at your website, and from its URL, we can find the associated repo and examine your commit history to confirm that your partner successfully pushed a page to your site.

Please make sure to have your website ready and have posted the URL by 10pm on Thursday 02/17/21.


Results

Plots

We show a scatter plot in this section.

par(mar = c(4, 4, .5, .1))
plot(mpg ~ hp, data = mtcars, pch = 19)

Tables

We show the data in this tab.

head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1