packages

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(quantmod)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Loading required package: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(robotstxt)
library(rvest)
library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

library(nycflights13)

Example slides 1.1

x_vec <- c(1,2,3,4)
x_vec > 2

## [1] FALSE FALSE  TRUE  TRUE

sum(x_vec>2)

## [1] 2

Assignment 1 Review

bike1 <- read.csv("Datasets/bike_sharing_data.csv")

# obs/variables

glimpse(bike1)

## Rows: 17,379
## Columns: 13
## $ datetime   <chr> "1/1/2011 0:00", "1/1/2011 1:00", "1/1/2011 2:00", "1/1/201…
## $ season     <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ holiday    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ workingday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ weather    <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,…
## $ temp       <dbl> 9.84, 9.02, 9.02, 9.84, 9.84, 9.84, 9.02, 8.20, 9.84, 13.12…
## $ atemp      <dbl> 14.395, 13.635, 13.635, 14.395, 14.395, 12.880, 13.635, 12.…
## $ humidity   <chr> "81", "80", "80", "75", "75", "75", "80", "86", "75", "76",…
## $ windspeed  <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 6.0032, 0.0000, 0.0…
## $ casual     <int> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 40, 41, 1…
## $ registered <int> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 71, 70, 5…
## $ count      <int> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, 106, 110…
## $ sources    <chr> "ad campaign", "www.yahoo.com", "www.google.fi", "AD campai…

what data type is humidity

str(bike1)

## 'data.frame':    17379 obs. of  13 variables:
##  $ datetime  : chr  "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : chr  "81" "80" "80" "75" ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : int  16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr  "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...

value of season in row 6251

bike1$season[6251]

## [1] 4

how many obs are winter

dim(subset(bike1,season==4))

## [1] 4232   13

# obs high wind during winter or spring

dim(subset(bike1, windspeed >= 40 & season %in% c(1,4)))

## [1] 46 13

Assignment 2 Review

get microsoft daily stock symbols

getSymbols("MSFT", src="yahoo",from = "2024-10-01", to = "2025-02-01")

## [1] "MSFT"

# of days

dim(MSFT)

## [1] 84  6

when was stock price highest

plot(MSFT$MSFT.Open)

Assignment 3 Review

is scraping allowed

paths_allowed("https://en.wikipedia.org/wiki/2024_in_film")

##  en.wikipedia.org

## [1] TRUE

scrape table data

link <- "https://en.wikipedia.org/wiki/2024_in_film"
page <- read_html(link)
tbls <- html_elements(page, "table")
highest_gross <- html_table(tbls[[2]])
highest_gross

## # A tibble: 10 × 4
##     Rank Title                           Distributor  `Worldwide gross`
##    <int> <chr>                           <chr>        <chr>            
##  1     1 Inside Out 2                    Disney       $1,698,863,816   
##  2     2 Deadpool & Wolverine            Disney       $1,338,073,645   
##  3     3 Moana 2                         Disney       $1,059,242,164   
##  4     4 Despicable Me 4                 Universal    $971,315,095     
##  5     5 Wicked                          Universal    $756,535,681     
##  6     6 Mufasa: The Lion King           Disney       $723,060,982     
##  7     7 Dune: Part Two                  Warner Bros. $715,409,065     
##  8     8 Godzilla x Kong: The New Empire Warner Bros. $572,505,338     
##  9     9 Kung Fu Panda 4                 Universal    $547,689,492     
## 10    10 Sonic the Hedgehog 3            Paramount    $492,162,604

Assignment 4

# missing values in Petal.Length

dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")

sum(is.na(dirty_iris$Petal.Length))

## [1] 19

# and % of complete observations

num_complete <- sum(complete.cases(dirty_iris))
num_complete

## [1] 96

nrow(na.omit(dirty_iris))

## [1] 96

pct_complete <- num_complete / nrow(dirty_iris) * 100
pct_complete

## [1] 64

what type of special values in numeric columns

summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width 
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.1  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.3  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.3  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :Inf  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.8  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :Inf  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :12   
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

locate inf and replace with placeholder

inf_indicator <- is.infinite(dirty_iris$Petal.Width)
dirty_iris$Petal.Width[inf_indicator] <- NA
summary(dirty_iris)

##   Sepal.Length     Sepal.Width      Petal.Length    Petal.Width   
##  Min.   : 0.000   Min.   :-3.000   Min.   : 0.00   Min.   :0.100  
##  1st Qu.: 5.100   1st Qu.: 2.800   1st Qu.: 1.60   1st Qu.:0.300  
##  Median : 5.750   Median : 3.000   Median : 4.50   Median :1.300  
##  Mean   : 6.559   Mean   : 3.391   Mean   : 4.45   Mean   :1.207  
##  3rd Qu.: 6.400   3rd Qu.: 3.300   3rd Qu.: 5.10   3rd Qu.:1.800  
##  Max.   :73.000   Max.   :30.000   Max.   :63.00   Max.   :2.500  
##  NA's   :10       NA's   :17       NA's   :19      NA's   :13     
##    Species         
##  Length:150        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

find obs that violate rules

rules_violate <- subset(dirty_iris,c((Sepal.Width<=0)|(Sepal.Length>30)))
nrow(rules_violate)

## [1] 4

locate obs that violate and correct

neg_indicator <- which(dirty_iris$Sepal.Width<0)
dirty_iris$Sepal.Width[neg_indicator]<-abs(dirty_iris$Sepal.Width[neg_indicator])
dirty_iris[neg_indicator,]

##    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 16            5           3          3.5           1 versicolor

zero_indicator <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_indicator] <- NA

Sepal.Width mean

iris_mean <- mean(dirty_iris$Sepal.Width, na.rm=TRUE) 
iris_mean

## [1] 3.462121

Petal.Length median

iris_median<- median(dirty_iris$Petal.Length, na.rm=TRUE)
iris_median

## [1] 4.5

Sepal.Length linear regression

model <- lm(Sepal.Length~Sepal.Width+Petal.Width, data=dirty_iris)
I <- is.na(dirty_iris$Sepal.Length)
to_be_imputed <- dirty_iris[I,]
dirty_iris$Sepal.Length[I] <- predict(model, newdata = to_be_imputed)

Petal.Width kNN

iris_nearest <- kNN(dirty_iris)

## Sepal.Length Petal.Length  Petal.Width Sepal.Length Petal.Length  Petal.Width 
##          0.0          0.0          0.1         73.0         63.0          2.5 
## Sepal.Length  Sepal.Width  Petal.Width Sepal.Length  Sepal.Width  Petal.Width 
##          0.0          2.2          0.1         73.0         30.0          2.5 
## Sepal.Length  Sepal.Width Petal.Length Sepal.Length  Sepal.Width Petal.Length 
##          0.0          2.2          0.0         73.0         30.0         63.0

#iris_nearest

Base R approach

extract from mtcars

data("mtcars")
mtcars[ ,c("mpg", "cyl")]

##                      mpg cyl
## Mazda RX4           21.0   6
## Mazda RX4 Wag       21.0   6
## Datsun 710          22.8   4
## Hornet 4 Drive      21.4   6
## Hornet Sportabout   18.7   8
## Valiant             18.1   6
## Duster 360          14.3   8
## Merc 240D           24.4   4
## Merc 230            22.8   4
## Merc 280            19.2   6
## Merc 280C           17.8   6
## Merc 450SE          16.4   8
## Merc 450SL          17.3   8
## Merc 450SLC         15.2   8
## Cadillac Fleetwood  10.4   8
## Lincoln Continental 10.4   8
## Chrysler Imperial   14.7   8
## Fiat 128            32.4   4
## Honda Civic         30.4   4
## Toyota Corolla      33.9   4
## Toyota Corona       21.5   4
## Dodge Challenger    15.5   8
## AMC Javelin         15.2   8
## Camaro Z28          13.3   8
## Pontiac Firebird    19.2   8
## Fiat X1-9           27.3   4
## Porsche 914-2       26.0   4
## Lotus Europa        30.4   4
## Ford Pantera L      15.8   8
## Ferrari Dino        19.7   6
## Maserati Bora       15.0   8
## Volvo 142E          21.4   4

extract columns w string s

mtcars[grep("s", names(mtcars))]

##                      disp  qsec vs
## Mazda RX4           160.0 16.46  0
## Mazda RX4 Wag       160.0 17.02  0
## Datsun 710          108.0 18.61  1
## Hornet 4 Drive      258.0 19.44  1
## Hornet Sportabout   360.0 17.02  0
## Valiant             225.0 20.22  1
## Duster 360          360.0 15.84  0
## Merc 240D           146.7 20.00  1
## Merc 230            140.8 22.90  1
## Merc 280            167.6 18.30  1
## Merc 280C           167.6 18.90  1
## Merc 450SE          275.8 17.40  0
## Merc 450SL          275.8 17.60  0
## Merc 450SLC         275.8 18.00  0
## Cadillac Fleetwood  472.0 17.98  0
## Lincoln Continental 460.0 17.82  0
## Chrysler Imperial   440.0 17.42  0
## Fiat 128             78.7 19.47  1
## Honda Civic          75.7 18.52  1
## Toyota Corolla       71.1 19.90  1
## Toyota Corona       120.1 20.01  1
## Dodge Challenger    318.0 16.87  0
## AMC Javelin         304.0 17.30  0
## Camaro Z28          350.0 15.41  0
## Pontiac Firebird    400.0 17.05  0
## Fiat X1-9            79.0 18.90  1
## Porsche 914-2       120.3 16.70  0
## Lotus Europa         95.1 16.90  1
## Ford Pantera L      351.0 14.50  0
## Ferrari Dino        145.0 15.50  0
## Maserati Bora       301.0 14.60  0
## Volvo 142E          121.0 18.60  1

add new column hpCyl

mtcars$hpCyl <- mtcars$hp + mtcars$cyl

extract rows on rule

subset(mtcars, (cyl<=6) & (hp>500))

##  [1] mpg   cyl   disp  hp    drat  wt    qsec  vs    am    gear  carb  hpCyl
## <0 rows> (or 0-length row.names)

summarize data in groups

cyl4 <- mtcars[which(mtcars$cyl==4),]
mean(cyl4$disp)

## [1] 105.1364

max(cyl4$hp)

## [1] 113

dplyr approach

extract from mtcars

select(mtcars, mpg, cyl)

##                      mpg cyl
## Mazda RX4           21.0   6
## Mazda RX4 Wag       21.0   6
## Datsun 710          22.8   4
## Hornet 4 Drive      21.4   6
## Hornet Sportabout   18.7   8
## Valiant             18.1   6
## Duster 360          14.3   8
## Merc 240D           24.4   4
## Merc 230            22.8   4
## Merc 280            19.2   6
## Merc 280C           17.8   6
## Merc 450SE          16.4   8
## Merc 450SL          17.3   8
## Merc 450SLC         15.2   8
## Cadillac Fleetwood  10.4   8
## Lincoln Continental 10.4   8
## Chrysler Imperial   14.7   8
## Fiat 128            32.4   4
## Honda Civic         30.4   4
## Toyota Corolla      33.9   4
## Toyota Corona       21.5   4
## Dodge Challenger    15.5   8
## AMC Javelin         15.2   8
## Camaro Z28          13.3   8
## Pontiac Firebird    19.2   8
## Fiat X1-9           27.3   4
## Porsche 914-2       26.0   4
## Lotus Europa        30.4   4
## Ford Pantera L      15.8   8
## Ferrari Dino        19.7   6
## Maserati Bora       15.0   8
## Volvo 142E          21.4   4

extract columns w string s

mtcars %>% 
  select(contains("s"))

##                      disp  qsec vs
## Mazda RX4           160.0 16.46  0
## Mazda RX4 Wag       160.0 17.02  0
## Datsun 710          108.0 18.61  1
## Hornet 4 Drive      258.0 19.44  1
## Hornet Sportabout   360.0 17.02  0
## Valiant             225.0 20.22  1
## Duster 360          360.0 15.84  0
## Merc 240D           146.7 20.00  1
## Merc 230            140.8 22.90  1
## Merc 280            167.6 18.30  1
## Merc 280C           167.6 18.90  1
## Merc 450SE          275.8 17.40  0
## Merc 450SL          275.8 17.60  0
## Merc 450SLC         275.8 18.00  0
## Cadillac Fleetwood  472.0 17.98  0
## Lincoln Continental 460.0 17.82  0
## Chrysler Imperial   440.0 17.42  0
## Fiat 128             78.7 19.47  1
## Honda Civic          75.7 18.52  1
## Toyota Corolla       71.1 19.90  1
## Toyota Corona       120.1 20.01  1
## Dodge Challenger    318.0 16.87  0
## AMC Javelin         304.0 17.30  0
## Camaro Z28          350.0 15.41  0
## Pontiac Firebird    400.0 17.05  0
## Fiat X1-9            79.0 18.90  1
## Porsche 914-2       120.3 16.70  0
## Lotus Europa         95.1 16.90  1
## Ford Pantera L      351.0 14.50  0
## Ferrari Dino        145.0 15.50  0
## Maserati Bora       301.0 14.60  0
## Volvo 142E          121.0 18.60  1

add new column hpCyl

mtcars %>%
  mutate(hpCyl = hp + cyl)

##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb hpCyl
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4   116
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4   116
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1    97
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1   116
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2   183
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1   111
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4   253
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2    66
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2    99
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4   129
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4   129
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3   188
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3   188
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3   188
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4   213
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4   223
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4   238
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1    70
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2    56
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1    69
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1   101
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2   158
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2   158
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4   253
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2   183
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1    70
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2    95
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2   117
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4   272
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6   181
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8   343
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2   113

extract rows on rule

filter(mtcars, cyl <= 6, hp>500)

##  [1] mpg   cyl   disp  hp    drat  wt    qsec  vs    am    gear  carb  hpCyl
## <0 rows> (or 0-length row.names)

summarize data in groups

stats_cyl <- mtcars %>%
  group_by(cyl) %>%
  summarize(mean(disp), max(hp))
#stats_cyl

Assignment 5

check code

not_cancelled <- flights %>%
 filter(!is.na(dep_delay))

not_cancelled %>% 
 group_by(year, month, day)

## # A tibble: 328,521 × 19
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 328,511 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

tailnum lowest avg arrival delay

not_cancelled %>% 
group_by(tailnum) %>% 
 summarize(
  delay = mean(arr_delay)) %>%
 arrange(delay)

## # A tibble: 4,037 × 2
##    tailnum delay
##    <chr>   <dbl>
##  1 N560AS  -53  
##  2 N315AS  -51  
##  3 N517AS  -40.5
##  4 N7AYAA  -35  
##  5 N585AS  -34.5
##  6 N915DN  -34  
##  7 N512AS  -31.5
##  8 N594AS  -31.3
##  9 N564AS  -31  
## 10 N593AS  -30.5
## # ℹ 4,027 more rows

departure delays more than 60 min

flights %>%
  group_by(month) %>%
  summarize(
    total_flights = n(),
    delayed_over_60 = sum(dep_delay > 60, na.rm = TRUE),
    prop_delayed = delayed_over_60 / total_flights
  ) %>%
  print()

## # A tibble: 12 × 4
##    month total_flights delayed_over_60 prop_delayed
##    <int>         <int>           <int>        <dbl>
##  1     1         27004            1821       0.0674
##  2     2         24951            1654       0.0663
##  3     3         28834            2340       0.0812
##  4     4         28330            2535       0.0895
##  5     5         28796            2309       0.0802
##  6     6         28243            3494       0.124 
##  7     7         29425            3820       0.130 
##  8     8         29327            2295       0.0783
##  9     9         27574            1330       0.0482
## 10    10         28889            1344       0.0465
## 11    11         27268            1086       0.0398
## 12    12         28135            2553       0.0907

destinations with most carriers

not_cancelled %>% 
 group_by(dest) %>% 
 summarize(carriers = n_distinct(carrier)) %>% 
 arrange(desc(carriers))

## # A tibble: 104 × 2
##    dest  carriers
##    <chr>    <int>
##  1 ATL          7
##  2 BOS          7
##  3 CLT          7
##  4 ORD          7
##  5 TPA          7
##  6 AUS          6
##  7 DCA          6
##  8 DTW          6
##  9 IAD          6
## 10 MSP          6
## # ℹ 94 more rows

Exam 1 Review

2025-10-09

packages

Example slides 1.1

Assignment 1 Review

# obs/variables

what data type is humidity

value of season in row 6251

how many obs are winter

# obs high wind during winter or spring

Assignment 2 Review

get microsoft daily stock symbols

# of days

when was stock price highest

Assignment 3 Review

is scraping allowed

scrape table data

Assignment 4

# missing values in Petal.Length

# and % of complete observations

what type of special values in numeric columns

locate inf and replace with placeholder

find obs that violate rules

locate obs that violate and correct

Sepal.Width mean

Petal.Length median

Sepal.Length linear regression

Petal.Width kNN

Base R approach

extract from mtcars

extract columns w string s

add new column hpCyl

extract rows on rule

summarize data in groups

dplyr approach

extract from mtcars

extract columns w string s

add new column hpCyl

extract rows on rule

summarize data in groups

Assignment 5

check code

tailnum lowest avg arrival delay

departure delays more than 60 min

destinations with most carriers