library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(robotstxt)
library(rvest)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(nycflights13)
x_vec <- c(1,2,3,4)
x_vec > 2
## [1] FALSE FALSE TRUE TRUE
sum(x_vec>2)
## [1] 2
bike1 <- read.csv("Datasets/bike_sharing_data.csv")
glimpse(bike1)
## Rows: 17,379
## Columns: 13
## $ datetime <chr> "1/1/2011 0:00", "1/1/2011 1:00", "1/1/2011 2:00", "1/1/201…
## $ season <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ holiday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ workingday <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ weather <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,…
## $ temp <dbl> 9.84, 9.02, 9.02, 9.84, 9.84, 9.84, 9.02, 8.20, 9.84, 13.12…
## $ atemp <dbl> 14.395, 13.635, 13.635, 14.395, 14.395, 12.880, 13.635, 12.…
## $ humidity <chr> "81", "80", "80", "75", "75", "75", "80", "86", "75", "76",…
## $ windspeed <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 6.0032, 0.0000, 0.0…
## $ casual <int> 3, 8, 5, 3, 0, 0, 2, 1, 1, 8, 12, 26, 29, 47, 35, 40, 41, 1…
## $ registered <int> 13, 32, 27, 10, 1, 1, 0, 2, 7, 6, 24, 30, 55, 47, 71, 70, 5…
## $ count <int> 16, 40, 32, 13, 1, 1, 2, 3, 8, 14, 36, 56, 84, 94, 106, 110…
## $ sources <chr> "ad campaign", "www.yahoo.com", "www.google.fi", "AD campai…
str(bike1)
## 'data.frame': 17379 obs. of 13 variables:
## $ datetime : chr "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
## $ season : int 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday : int 0 0 0 0 0 0 0 0 0 0 ...
## $ workingday: int 0 0 0 0 0 0 0 0 0 0 ...
## $ weather : int 1 1 1 1 1 2 1 1 1 1 ...
## $ temp : num 9.84 9.02 9.02 9.84 9.84 ...
## $ atemp : num 14.4 13.6 13.6 14.4 14.4 ...
## $ humidity : chr "81" "80" "80" "75" ...
## $ windspeed : num 0 0 0 0 0 ...
## $ casual : int 3 8 5 3 0 0 2 1 1 8 ...
## $ registered: int 13 32 27 10 1 1 0 2 7 6 ...
## $ count : int 16 40 32 13 1 1 2 3 8 14 ...
## $ sources : chr "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...
bike1$season[6251]
## [1] 4
dim(subset(bike1,season==4))
## [1] 4232 13
dim(subset(bike1, windspeed >= 40 & season %in% c(1,4)))
## [1] 46 13
getSymbols("MSFT", src="yahoo",from = "2024-10-01", to = "2025-02-01")
## [1] "MSFT"
dim(MSFT)
## [1] 84 6
plot(MSFT$MSFT.Open)
paths_allowed("https://en.wikipedia.org/wiki/2024_in_film")
## en.wikipedia.org
## [1] TRUE
link <- "https://en.wikipedia.org/wiki/2024_in_film"
page <- read_html(link)
tbls <- html_elements(page, "table")
highest_gross <- html_table(tbls[[2]])
highest_gross
## # A tibble: 10 × 4
## Rank Title Distributor `Worldwide gross`
## <int> <chr> <chr> <chr>
## 1 1 Inside Out 2 Disney $1,698,863,816
## 2 2 Deadpool & Wolverine Disney $1,338,073,645
## 3 3 Moana 2 Disney $1,059,242,164
## 4 4 Despicable Me 4 Universal $971,315,095
## 5 5 Wicked Universal $756,535,681
## 6 6 Mufasa: The Lion King Disney $723,060,982
## 7 7 Dune: Part Two Warner Bros. $715,409,065
## 8 8 Godzilla x Kong: The New Empire Warner Bros. $572,505,338
## 9 9 Kung Fu Panda 4 Universal $547,689,492
## 10 10 Sonic the Hedgehog 3 Paramount $492,162,604
dirty_iris <- read.csv("https://raw.githubusercontent.com/edwindj/datacleaning/master/data/dirty_iris.csv")
sum(is.na(dirty_iris$Petal.Length))
## [1] 19
num_complete <- sum(complete.cases(dirty_iris))
num_complete
## [1] 96
nrow(na.omit(dirty_iris))
## [1] 96
pct_complete <- num_complete / nrow(dirty_iris) * 100
pct_complete
## [1] 64
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.1
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.3
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.3
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :Inf
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.8
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :Inf
## NA's :10 NA's :17 NA's :19 NA's :12
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
inf_indicator <- is.infinite(dirty_iris$Petal.Width)
dirty_iris$Petal.Width[inf_indicator] <- NA
summary(dirty_iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. : 0.000 Min. :-3.000 Min. : 0.00 Min. :0.100
## 1st Qu.: 5.100 1st Qu.: 2.800 1st Qu.: 1.60 1st Qu.:0.300
## Median : 5.750 Median : 3.000 Median : 4.50 Median :1.300
## Mean : 6.559 Mean : 3.391 Mean : 4.45 Mean :1.207
## 3rd Qu.: 6.400 3rd Qu.: 3.300 3rd Qu.: 5.10 3rd Qu.:1.800
## Max. :73.000 Max. :30.000 Max. :63.00 Max. :2.500
## NA's :10 NA's :17 NA's :19 NA's :13
## Species
## Length:150
## Class :character
## Mode :character
##
##
##
##
rules_violate <- subset(dirty_iris,c((Sepal.Width<=0)|(Sepal.Length>30)))
nrow(rules_violate)
## [1] 4
neg_indicator <- which(dirty_iris$Sepal.Width<0)
dirty_iris$Sepal.Width[neg_indicator]<-abs(dirty_iris$Sepal.Width[neg_indicator])
dirty_iris[neg_indicator,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 16 5 3 3.5 1 versicolor
zero_indicator <- which(dirty_iris$Sepal.Width == 0)
dirty_iris$Sepal.Width[zero_indicator] <- NA
iris_mean <- mean(dirty_iris$Sepal.Width, na.rm=TRUE)
iris_mean
## [1] 3.462121
iris_median<- median(dirty_iris$Petal.Length, na.rm=TRUE)
iris_median
## [1] 4.5
model <- lm(Sepal.Length~Sepal.Width+Petal.Width, data=dirty_iris)
I <- is.na(dirty_iris$Sepal.Length)
to_be_imputed <- dirty_iris[I,]
dirty_iris$Sepal.Length[I] <- predict(model, newdata = to_be_imputed)
iris_nearest <- kNN(dirty_iris)
## Sepal.Length Petal.Length Petal.Width Sepal.Length Petal.Length Petal.Width
## 0.0 0.0 0.1 73.0 63.0 2.5
## Sepal.Length Sepal.Width Petal.Width Sepal.Length Sepal.Width Petal.Width
## 0.0 2.2 0.1 73.0 30.0 2.5
## Sepal.Length Sepal.Width Petal.Length Sepal.Length Sepal.Width Petal.Length
## 0.0 2.2 0.0 73.0 30.0 63.0
#iris_nearest
data("mtcars")
mtcars[ ,c("mpg", "cyl")]
## mpg cyl
## Mazda RX4 21.0 6
## Mazda RX4 Wag 21.0 6
## Datsun 710 22.8 4
## Hornet 4 Drive 21.4 6
## Hornet Sportabout 18.7 8
## Valiant 18.1 6
## Duster 360 14.3 8
## Merc 240D 24.4 4
## Merc 230 22.8 4
## Merc 280 19.2 6
## Merc 280C 17.8 6
## Merc 450SE 16.4 8
## Merc 450SL 17.3 8
## Merc 450SLC 15.2 8
## Cadillac Fleetwood 10.4 8
## Lincoln Continental 10.4 8
## Chrysler Imperial 14.7 8
## Fiat 128 32.4 4
## Honda Civic 30.4 4
## Toyota Corolla 33.9 4
## Toyota Corona 21.5 4
## Dodge Challenger 15.5 8
## AMC Javelin 15.2 8
## Camaro Z28 13.3 8
## Pontiac Firebird 19.2 8
## Fiat X1-9 27.3 4
## Porsche 914-2 26.0 4
## Lotus Europa 30.4 4
## Ford Pantera L 15.8 8
## Ferrari Dino 19.7 6
## Maserati Bora 15.0 8
## Volvo 142E 21.4 4
mtcars[grep("s", names(mtcars))]
## disp qsec vs
## Mazda RX4 160.0 16.46 0
## Mazda RX4 Wag 160.0 17.02 0
## Datsun 710 108.0 18.61 1
## Hornet 4 Drive 258.0 19.44 1
## Hornet Sportabout 360.0 17.02 0
## Valiant 225.0 20.22 1
## Duster 360 360.0 15.84 0
## Merc 240D 146.7 20.00 1
## Merc 230 140.8 22.90 1
## Merc 280 167.6 18.30 1
## Merc 280C 167.6 18.90 1
## Merc 450SE 275.8 17.40 0
## Merc 450SL 275.8 17.60 0
## Merc 450SLC 275.8 18.00 0
## Cadillac Fleetwood 472.0 17.98 0
## Lincoln Continental 460.0 17.82 0
## Chrysler Imperial 440.0 17.42 0
## Fiat 128 78.7 19.47 1
## Honda Civic 75.7 18.52 1
## Toyota Corolla 71.1 19.90 1
## Toyota Corona 120.1 20.01 1
## Dodge Challenger 318.0 16.87 0
## AMC Javelin 304.0 17.30 0
## Camaro Z28 350.0 15.41 0
## Pontiac Firebird 400.0 17.05 0
## Fiat X1-9 79.0 18.90 1
## Porsche 914-2 120.3 16.70 0
## Lotus Europa 95.1 16.90 1
## Ford Pantera L 351.0 14.50 0
## Ferrari Dino 145.0 15.50 0
## Maserati Bora 301.0 14.60 0
## Volvo 142E 121.0 18.60 1
mtcars$hpCyl <- mtcars$hp + mtcars$cyl
subset(mtcars, (cyl<=6) & (hp>500))
## [1] mpg cyl disp hp drat wt qsec vs am gear carb hpCyl
## <0 rows> (or 0-length row.names)
cyl4 <- mtcars[which(mtcars$cyl==4),]
mean(cyl4$disp)
## [1] 105.1364
max(cyl4$hp)
## [1] 113
select(mtcars, mpg, cyl)
## mpg cyl
## Mazda RX4 21.0 6
## Mazda RX4 Wag 21.0 6
## Datsun 710 22.8 4
## Hornet 4 Drive 21.4 6
## Hornet Sportabout 18.7 8
## Valiant 18.1 6
## Duster 360 14.3 8
## Merc 240D 24.4 4
## Merc 230 22.8 4
## Merc 280 19.2 6
## Merc 280C 17.8 6
## Merc 450SE 16.4 8
## Merc 450SL 17.3 8
## Merc 450SLC 15.2 8
## Cadillac Fleetwood 10.4 8
## Lincoln Continental 10.4 8
## Chrysler Imperial 14.7 8
## Fiat 128 32.4 4
## Honda Civic 30.4 4
## Toyota Corolla 33.9 4
## Toyota Corona 21.5 4
## Dodge Challenger 15.5 8
## AMC Javelin 15.2 8
## Camaro Z28 13.3 8
## Pontiac Firebird 19.2 8
## Fiat X1-9 27.3 4
## Porsche 914-2 26.0 4
## Lotus Europa 30.4 4
## Ford Pantera L 15.8 8
## Ferrari Dino 19.7 6
## Maserati Bora 15.0 8
## Volvo 142E 21.4 4
mtcars %>%
select(contains("s"))
## disp qsec vs
## Mazda RX4 160.0 16.46 0
## Mazda RX4 Wag 160.0 17.02 0
## Datsun 710 108.0 18.61 1
## Hornet 4 Drive 258.0 19.44 1
## Hornet Sportabout 360.0 17.02 0
## Valiant 225.0 20.22 1
## Duster 360 360.0 15.84 0
## Merc 240D 146.7 20.00 1
## Merc 230 140.8 22.90 1
## Merc 280 167.6 18.30 1
## Merc 280C 167.6 18.90 1
## Merc 450SE 275.8 17.40 0
## Merc 450SL 275.8 17.60 0
## Merc 450SLC 275.8 18.00 0
## Cadillac Fleetwood 472.0 17.98 0
## Lincoln Continental 460.0 17.82 0
## Chrysler Imperial 440.0 17.42 0
## Fiat 128 78.7 19.47 1
## Honda Civic 75.7 18.52 1
## Toyota Corolla 71.1 19.90 1
## Toyota Corona 120.1 20.01 1
## Dodge Challenger 318.0 16.87 0
## AMC Javelin 304.0 17.30 0
## Camaro Z28 350.0 15.41 0
## Pontiac Firebird 400.0 17.05 0
## Fiat X1-9 79.0 18.90 1
## Porsche 914-2 120.3 16.70 0
## Lotus Europa 95.1 16.90 1
## Ford Pantera L 351.0 14.50 0
## Ferrari Dino 145.0 15.50 0
## Maserati Bora 301.0 14.60 0
## Volvo 142E 121.0 18.60 1
mtcars %>%
mutate(hpCyl = hp + cyl)
## mpg cyl disp hp drat wt qsec vs am gear carb hpCyl
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 116
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 116
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 97
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 116
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 183
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 111
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 253
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 66
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 99
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 129
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 129
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 188
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 188
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 188
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 213
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 223
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 238
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 70
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 56
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 69
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 101
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 158
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 158
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 253
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 183
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 70
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 95
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 117
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 272
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 181
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 343
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 113
filter(mtcars, cyl <= 6, hp>500)
## [1] mpg cyl disp hp drat wt qsec vs am gear carb hpCyl
## <0 rows> (or 0-length row.names)
stats_cyl <- mtcars %>%
group_by(cyl) %>%
summarize(mean(disp), max(hp))
#stats_cyl
not_cancelled <- flights %>%
filter(!is.na(dep_delay))
not_cancelled %>%
group_by(year, month, day)
## # A tibble: 328,521 × 19
## # Groups: year, month, day [365]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 328,511 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
not_cancelled %>%
group_by(tailnum) %>%
summarize(
delay = mean(arr_delay)) %>%
arrange(delay)
## # A tibble: 4,037 × 2
## tailnum delay
## <chr> <dbl>
## 1 N560AS -53
## 2 N315AS -51
## 3 N517AS -40.5
## 4 N7AYAA -35
## 5 N585AS -34.5
## 6 N915DN -34
## 7 N512AS -31.5
## 8 N594AS -31.3
## 9 N564AS -31
## 10 N593AS -30.5
## # ℹ 4,027 more rows
flights %>%
group_by(month) %>%
summarize(
total_flights = n(),
delayed_over_60 = sum(dep_delay > 60, na.rm = TRUE),
prop_delayed = delayed_over_60 / total_flights
) %>%
print()
## # A tibble: 12 × 4
## month total_flights delayed_over_60 prop_delayed
## <int> <int> <int> <dbl>
## 1 1 27004 1821 0.0674
## 2 2 24951 1654 0.0663
## 3 3 28834 2340 0.0812
## 4 4 28330 2535 0.0895
## 5 5 28796 2309 0.0802
## 6 6 28243 3494 0.124
## 7 7 29425 3820 0.130
## 8 8 29327 2295 0.0783
## 9 9 27574 1330 0.0482
## 10 10 28889 1344 0.0465
## 11 11 27268 1086 0.0398
## 12 12 28135 2553 0.0907
not_cancelled %>%
group_by(dest) %>%
summarize(carriers = n_distinct(carrier)) %>%
arrange(desc(carriers))
## # A tibble: 104 × 2
## dest carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ℹ 94 more rows