Library
library(dplyr)
library(gapminder)
Data
head(gapminder)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
head(starwars)
## # A tibble: 6 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywal… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## # ³eye_color, ⁴birth_year, ⁵homeworld
Column Selection in R dplyr
gapminder %>%
select(country, year, pop)
## # A tibble: 1,704 × 3
## country year pop
## <fct> <int> <int>
## 1 Afghanistan 1952 8425333
## 2 Afghanistan 1957 9240934
## 3 Afghanistan 1962 10267083
## 4 Afghanistan 1967 11537966
## 5 Afghanistan 1972 13079460
## 6 Afghanistan 1977 14880372
## 7 Afghanistan 1982 12881816
## 8 Afghanistan 1987 13867957
## 9 Afghanistan 1992 16317921
## 10 Afghanistan 1997 22227415
## # … with 1,694 more rows
Data Filtering-1
gapminder %>%
select(country, year, lifeExp) %>%
filter(year == 2007)
## # A tibble: 142 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Afghanistan 2007 43.8
## 2 Albania 2007 76.4
## 3 Algeria 2007 72.3
## 4 Angola 2007 42.7
## 5 Argentina 2007 75.3
## 6 Australia 2007 81.2
## 7 Austria 2007 79.8
## 8 Bahrain 2007 75.6
## 9 Bangladesh 2007 64.1
## 10 Belgium 2007 79.4
## # … with 132 more rows
Data Filtering-2
gapminder %>%
select(country, year, lifeExp) %>%
filter(year == 2007, country == "Poland")
## # A tibble: 1 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Poland 2007 75.6
Data Filtering-3
gapminder %>%
select(country, year, lifeExp) %>%
filter(year == 2007, country %in% c("Poland", "Croatia"))
## # A tibble: 2 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Croatia 2007 75.7
## 2 Poland 2007 75.6
Data Filtering-4
starwars %>%
select(name, ends_with("color"))
## # A tibble: 87 × 4
## name hair_color skin_color eye_color
## <chr> <chr> <chr> <chr>
## 1 Luke Skywalker blond fair blue
## 2 C-3PO <NA> gold yellow
## 3 R2-D2 <NA> white, blue red
## 4 Darth Vader none white yellow
## 5 Leia Organa brown light brown
## 6 Owen Lars brown, grey light blue
## 7 Beru Whitesun lars brown light blue
## 8 R5-D4 <NA> white, red red
## 9 Biggs Darklighter black light brown
## 10 Obi-Wan Kenobi auburn, white fair blue-gray
## # … with 77 more rows
Select only numeric coloumn
starwars %>%
select_if(is.numeric)
## # A tibble: 87 × 3
## height mass birth_year
## <int> <dbl> <dbl>
## 1 172 77 19
## 2 167 75 112
## 3 96 32 33
## 4 202 136 41.9
## 5 150 49 19
## 6 178 120 52
## 7 165 75 47
## 8 97 32 NA
## 9 183 84 24
## 10 182 77 57
## # … with 77 more rows
Select only factor coloumn
mtcars %>%
select_if(is.factor)
## data frame with 0 columns and 32 rows
Data Filtering (condition &)
gapminder %>%
select(country, year, lifeExp) %>%
filter(year >= 2007& country %in% c("Poland", "Croatia"))
## # A tibble: 2 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Croatia 2007 75.7
## 2 Poland 2007 75.6
Data Filtering (condition or (|))
gapminder %>%
select(country, year, lifeExp) %>%
filter(year >= 2007 | country %in% c("Poland", "Croatia"))
## # A tibble: 164 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Afghanistan 2007 43.8
## 2 Albania 2007 76.4
## 3 Algeria 2007 72.3
## 4 Angola 2007 42.7
## 5 Argentina 2007 75.3
## 6 Australia 2007 81.2
## 7 Austria 2007 79.8
## 8 Bahrain 2007 75.6
## 9 Bangladesh 2007 64.1
## 10 Belgium 2007 79.4
## # … with 154 more rows
Data Filtering (Not condition (!))
gapminder %>%
select(country, year, lifeExp) %>%
filter(year >= 2007 | !country %in% c("Poland", "Croatia"))
## # A tibble: 1,682 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Afghanistan 1952 28.8
## 2 Afghanistan 1957 30.3
## 3 Afghanistan 1962 32.0
## 4 Afghanistan 1967 34.0
## 5 Afghanistan 1972 36.1
## 6 Afghanistan 1977 38.4
## 7 Afghanistan 1982 39.9
## 8 Afghanistan 1987 40.8
## 9 Afghanistan 1992 41.7
## 10 Afghanistan 1997 41.8
## # … with 1,672 more rows
Data Filtering (pattern matching)
gapminder %>%
select(country, year, lifeExp) %>%
filter(grepl("Po", country))
## # A tibble: 24 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Poland 1952 61.3
## 2 Poland 1957 65.8
## 3 Poland 1962 67.6
## 4 Poland 1967 69.6
## 5 Poland 1972 70.8
## 6 Poland 1977 70.7
## 7 Poland 1982 71.3
## 8 Poland 1987 71.0
## 9 Poland 1992 71.0
## 10 Poland 1997 72.8
## # … with 14 more rows
Data Ordering-1
gapminder %>%
select(country, year, lifeExp) %>%
filter(year == 2007) %>%
arrange(lifeExp)
## # A tibble: 142 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Swaziland 2007 39.6
## 2 Mozambique 2007 42.1
## 3 Zambia 2007 42.4
## 4 Sierra Leone 2007 42.6
## 5 Lesotho 2007 42.6
## 6 Angola 2007 42.7
## 7 Zimbabwe 2007 43.5
## 8 Afghanistan 2007 43.8
## 9 Central African Republic 2007 44.7
## 10 Liberia 2007 45.7
## # … with 132 more rows
Data Ordering-2
gapminder %>%
select(country, year, lifeExp) %>%
filter(year == 2007) %>%
arrange(desc(lifeExp))
## # A tibble: 142 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Japan 2007 82.6
## 2 Hong Kong, China 2007 82.2
## 3 Iceland 2007 81.8
## 4 Switzerland 2007 81.7
## 5 Australia 2007 81.2
## 6 Spain 2007 80.9
## 7 Sweden 2007 80.9
## 8 Israel 2007 80.7
## 9 France 2007 80.7
## 10 Canada 2007 80.7
## # … with 132 more rows
Select top values
gapminder %>%
select(country, year, lifeExp) %>%
filter(year == 2007) %>%
arrange(desc(lifeExp)) %>%
top_n(5)
## Selecting by lifeExp
## # A tibble: 5 × 3
## country year lifeExp
## <fct> <int> <dbl>
## 1 Japan 2007 82.6
## 2 Hong Kong, China 2007 82.2
## 3 Iceland 2007 81.8
## 4 Switzerland 2007 81.7
## 5 Australia 2007 81.2
Creating Columns-1
gapminder %>%
select(country, year, pop, gdpPercap) %>%
filter(year == 2007) %>%
mutate(gdp = pop * gdpPercap) %>%
arrange(desc(gdp)) %>%
top_n(5)
## Selecting by gdp
## # A tibble: 5 × 5
## country year pop gdpPercap gdp
## <fct> <int> <int> <dbl> <dbl>
## 1 United States 2007 301139947 42952. 1.29e13
## 2 China 2007 1318683096 4959. 6.54e12
## 3 Japan 2007 127467972 31656. 4.04e12
## 4 India 2007 1110396331 2452. 2.72e12
## 5 Germany 2007 82400996 32170. 2.65e12
Creating Columns-2
gapminder %>%
select(country, year, pop, gdpPercap) %>%
filter(year == 2007) %>%
transmute(gdp = pop * gdpPercap) %>%
arrange(desc(gdp)) %>%
top_n(5)
## Selecting by gdp
## # A tibble: 5 × 1
## gdp
## <dbl>
## 1 1.29e13
## 2 6.54e12
## 3 4.04e12
## 4 2.72e12
## 5 2.65e12
Creating Columns-3
starwars %>%
mutate(name, bmi = mass / ((height / 100) ^ 2)) %>%
select(name:mass, bmi)
## # A tibble: 87 × 4
## name height mass bmi
## <chr> <int> <dbl> <dbl>
## 1 Luke Skywalker 172 77 26.0
## 2 C-3PO 167 75 26.9
## 3 R2-D2 96 32 34.7
## 4 Darth Vader 202 136 33.3
## 5 Leia Organa 150 49 21.8
## 6 Owen Lars 178 120 37.9
## 7 Beru Whitesun lars 165 75 27.5
## 8 R5-D4 97 32 34.0
## 9 Biggs Darklighter 183 84 25.1
## 10 Obi-Wan Kenobi 182 77 23.2
## # … with 77 more rows
Creating Columns-4
starwars %>%
group_by(species) %>%
summarise(
n = n(),
mass = mean(mass, na.rm = TRUE)
) %>%
filter(
n > 1,
mass > 50
)
## # A tibble: 8 × 3
## species n mass
## <chr> <int> <dbl>
## 1 Droid 6 69.8
## 2 Gungan 3 74
## 3 Human 35 82.8
## 4 Kaminoan 2 88
## 5 Mirialan 2 53.1
## 6 Twi'lek 2 55
## 7 Wookiee 2 124
## 8 Zabrak 2 80
Creating Columns & if else
df =data.frame(x = c(1,5,6,NA))
df %>%
mutate(newvar=if_else(x<5, x+1, x+2,0))
## x newvar
## 1 1 2
## 2 5 7
## 3 6 8
## 4 NA 0
Creating Columns & Nested IF ELSE
mydf =data.frame(x = c(1:5,NA))
mydf %>% mutate(newvar= if_else(is.na(x),"I am missing",
if_else(x==1,"I am one",
if_else(x==2,"I am two",
if_else(x==3,
"I am three","Others")))))
## x newvar
## 1 1 I am one
## 2 2 I am two
## 3 3 I am three
## 4 4 Others
## 5 5 Others
## 6 NA I am missing
Creating Columns & WHEN Statement
mydf %>% mutate(flag = case_when(is.na(x) ~ "I am missing",
x == 1 ~ "I am one",
x == 2 ~ "I am two",
x == 3 ~ "I am three",
TRUE ~ "Others"))
## x flag
## 1 1 I am one
## 2 2 I am two
## 3 3 I am three
## 4 4 Others
## 5 5 Others
## 6 NA I am missing
Creating Columns & across()
mtcars %>%
group_by(carb) %>%
mutate(across(where(is.numeric), mean))
## # A tibble: 32 × 11
## # Groups: carb [6]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 15.8 7.2 309. 187 3.60 3.90 17.0 0.2 0.3 3.6 4
## 2 15.8 7.2 309. 187 3.60 3.90 17.0 0.2 0.3 3.6 4
## 3 25.3 4.57 134. 86 3.68 2.49 19.5 1 0.571 3.57 1
## 4 25.3 4.57 134. 86 3.68 2.49 19.5 1 0.571 3.57 1
## 5 22.4 5.6 208. 117. 3.70 2.86 18.2 0.5 0.4 3.8 2
## 6 25.3 4.57 134. 86 3.68 2.49 19.5 1 0.571 3.57 1
## 7 15.8 7.2 309. 187 3.60 3.90 17.0 0.2 0.3 3.6 4
## 8 22.4 5.6 208. 117. 3.70 2.86 18.2 0.5 0.4 3.8 2
## 9 22.4 5.6 208. 117. 3.70 2.86 18.2 0.5 0.4 3.8 2
## 10 15.8 7.2 309. 187 3.60 3.90 17.0 0.2 0.3 3.6 4
## # … with 22 more rows
new columns whose value depends on some conditions
gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
summarize(avgLifeExp = mean(lifeExp)) %>%
mutate(over75 = if_else(avgLifeExp > 75, "Y", "N"))
## # A tibble: 5 × 3
## continent avgLifeExp over75
## <fct> <dbl> <chr>
## 1 Africa 54.8 N
## 2 Americas 73.6 N
## 3 Asia 70.7 N
## 4 Europe 77.6 Y
## 5 Oceania 80.7 Y
Calculating Summary Statistics-1
gapminder %>%
summarize(avgLifeExp = mean(lifeExp))
## # A tibble: 1 × 1
## avgLifeExp
## <dbl>
## 1 59.5
Calculating Summary Statistics-2
gapminder %>%
filter(year == 2007, continent == "Europe") %>%
summarize(avgLifeExp = mean(lifeExp))
## # A tibble: 1 × 1
## avgLifeExp
## <dbl>
## 1 77.6
Grouping in R dplyr-1
gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
summarize(avgLifeExp = mean(lifeExp))
## # A tibble: 5 × 2
## continent avgLifeExp
## <fct> <dbl>
## 1 Africa 54.8
## 2 Americas 73.6
## 3 Asia 70.7
## 4 Europe 77.6
## 5 Oceania 80.7
Grouping in R dplyr-2
gapminder %>%
filter(year == 2007) %>%
group_by(continent) %>%
summarize(avgLifeExp = mean(lifeExp)) %>%
arrange(desc(avgLifeExp))
## # A tibble: 5 × 2
## continent avgLifeExp
## <fct> <dbl>
## 1 Oceania 80.7
## 2 Europe 77.6
## 3 Americas 73.6
## 4 Asia 70.7
## 5 Africa 54.8
Grouping in R dplyr & across() function
mtcars %>%
group_by(carb) %>%
summarise(across(where(is.numeric), mean))
## # A tibble: 6 × 11
## carb mpg cyl disp hp drat wt qsec vs am gear
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 25.3 4.57 134. 86 3.68 2.49 19.5 1 0.571 3.57
## 2 2 22.4 5.6 208. 117. 3.70 2.86 18.2 0.5 0.4 3.8
## 3 3 16.3 8 276. 180 3.07 3.86 17.7 0 0 3
## 4 4 15.8 7.2 309. 187 3.60 3.90 17.0 0.2 0.3 3.6
## 5 6 19.7 6 145 175 3.62 2.77 15.5 0 1 5
## 6 8 15 8 301 335 3.54 3.57 14.6 0 1 5
Grouping in R dplyr & Multiple across() function
mtcars %>%
group_by(carb) %>%
summarise(across(mpg:qsec, mean), across(vs:gear, n_distinct))
## # A tibble: 6 × 11
## carb mpg cyl disp hp drat wt qsec vs am gear
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int>
## 1 1 25.3 4.57 134. 86 3.68 2.49 19.5 1 2 2
## 2 2 22.4 5.6 208. 117. 3.70 2.86 18.2 2 2 3
## 3 3 16.3 8 276. 180 3.07 3.86 17.7 1 1 1
## 4 4 15.8 7.2 309. 187 3.60 3.90 17.0 2 2 3
## 5 6 19.7 6 145 175 3.62 2.77 15.5 1 1 1
## 6 8 15 8 301 335 3.54 3.57 14.6 1 1 1
Data Slicing-1
gapminder %>%
slice(1)
## # A tibble: 1 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
Data Slicing-2
gapminder %>%
slice_head(n = 3)
## # A tibble: 3 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
Data Slicing-3
gapminder %>%
slice_tail(n = 3)
## # A tibble: 3 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Zimbabwe Africa 1997 46.8 11404948 792.
## 2 Zimbabwe Africa 2002 40.0 11926563 672.
## 3 Zimbabwe Africa 2007 43.5 12311143 470.
Data Slicing-4
gapminder %>%
slice_sample(n = 5)
## # A tibble: 5 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Haiti Americas 1962 43.6 3880130 1797.
## 2 Congo, Dem. Rep. Africa 1982 47.8 30646495 674.
## 3 Austria Europe 1987 74.9 7578903 23688.
## 4 Turkey Europe 1982 61.0 47328791 4241.
## 5 Algeria Africa 2007 72.3 33333216 6223.
Data Slicing-5
gapminder %>%
filter(year == 2007) %>%
slice_min(lifeExp, n = 5)
## # A tibble: 5 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Swaziland Africa 2007 39.6 1133066 4513.
## 2 Mozambique Africa 2007 42.1 19951656 824.
## 3 Zambia Africa 2007 42.4 11746035 1271.
## 4 Sierra Leone Africa 2007 42.6 6144562 863.
## 5 Lesotho Africa 2007 42.6 2012649 1569.
Data Slicing-6
gapminder %>%
filter(year == 2007) %>%
slice_max(lifeExp, n = 5)
## # A tibble: 5 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Japan Asia 2007 82.6 127467972 31656.
## 2 Hong Kong, China Asia 2007 82.2 6980412 39725.
## 3 Iceland Europe 2007 81.8 301931 36181.
## 4 Switzerland Europe 2007 81.7 7554661 37506.
## 5 Australia Oceania 2007 81.2 20434176 34435.
sample 3 values
sample_n(starwars,3)
## # A tibble: 3 × 14
## name height mass hair_c…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Poe Dameron NA NA brown light brown NA male mascu… <NA>
## 2 Cliegg Lars 183 NA brown fair blue 82 male mascu… Tatooi…
## 3 Nute Gunray 191 90 none mottle… red NA male mascu… Cato N…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## # ³eye_color, ⁴birth_year, ⁵homeworld
sample 10% of rows
sample_frac(starwars,0.1)
## # A tibble: 9 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 IG-88 200 140 none metal red 15 none mascu… <NA>
## 2 Ki-Adi-Mundi 198 82 white pale yellow 92 male mascu… Cerea
## 3 Yoda 66 17 white green brown 896 male mascu… <NA>
## 4 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 5 Shmi Skywal… 163 NA black fair brown 72 fema… femin… Tatooi…
## 6 Sebulba 112 40 none grey, … orange NA male mascu… Malast…
## 7 Gregar Typho 185 85 black dark brown NA male mascu… Naboo
## 8 Darth Maul 175 80 none red yellow 54 male mascu… Dathom…
## 9 Jek Tono Po… 180 110 brown fair blue NA male mascu… Bestin…
## # … with 4 more variables: species <chr>, films <list>, vehicles <list>,
## # starships <list>, and abbreviated variable names ¹hair_color, ²skin_color,
## # ³eye_color, ⁴birth_year, ⁵homeworld
Remove Duplicate Rows based on all the variables
distinct(starwars)
## # A tibble: 87 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
Remove Duplicate Rows based on a variable
distinct(starwars, height, .keep_all= TRUE)
## # A tibble: 46 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 36 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
Remove Duplicates Rows based on multiple variables
distinct(starwars, height, mass, .keep_all= TRUE)
## # A tibble: 77 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 67 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
Dropping Variables -1
select(starwars, -height, -mass)
## # A tibble: 87 × 12
## name hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵ species films
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <lis>
## 1 Luke Skyw… blond fair blue 19 male mascu… Tatooi… Human <chr>
## 2 C-3PO <NA> gold yellow 112 none mascu… Tatooi… Droid <chr>
## 3 R2-D2 <NA> white,… red 33 none mascu… Naboo Droid <chr>
## 4 Darth Vad… none white yellow 41.9 male mascu… Tatooi… Human <chr>
## 5 Leia Orga… brown light brown 19 fema… femin… Aldera… Human <chr>
## 6 Owen Lars brown,… light blue 52 male mascu… Tatooi… Human <chr>
## 7 Beru Whit… brown light blue 47 fema… femin… Tatooi… Human <chr>
## 8 R5-D4 <NA> white,… red NA none mascu… Tatooi… Droid <chr>
## 9 Biggs Dar… black light brown 24 male mascu… Tatooi… Human <chr>
## 10 Obi-Wan K… auburn… fair blue-g… 57 male mascu… Stewjon Human <chr>
## # … with 77 more rows, 2 more variables: vehicles <list>, starships <list>, and
## # abbreviated variable names ¹hair_color, ²skin_color, ³eye_color,
## # ⁴birth_year, ⁵homeworld
Dropping Variables -2
select(starwars, -c(height, mass))
## # A tibble: 87 × 12
## name hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵ species films
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <lis>
## 1 Luke Skyw… blond fair blue 19 male mascu… Tatooi… Human <chr>
## 2 C-3PO <NA> gold yellow 112 none mascu… Tatooi… Droid <chr>
## 3 R2-D2 <NA> white,… red 33 none mascu… Naboo Droid <chr>
## 4 Darth Vad… none white yellow 41.9 male mascu… Tatooi… Human <chr>
## 5 Leia Orga… brown light brown 19 fema… femin… Aldera… Human <chr>
## 6 Owen Lars brown,… light blue 52 male mascu… Tatooi… Human <chr>
## 7 Beru Whit… brown light blue 47 fema… femin… Tatooi… Human <chr>
## 8 R5-D4 <NA> white,… red NA none mascu… Tatooi… Droid <chr>
## 9 Biggs Dar… black light brown 24 male mascu… Tatooi… Human <chr>
## 10 Obi-Wan K… auburn… fair blue-g… 57 male mascu… Stewjon Human <chr>
## # … with 77 more rows, 2 more variables: vehicles <list>, starships <list>, and
## # abbreviated variable names ¹hair_color, ²skin_color, ³eye_color,
## # ⁴birth_year, ⁵homeworld
rename
rename(starwars, height1=height)
## # A tibble: 87 × 14
## name height1 mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skyw… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vad… 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Orga… 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru Whit… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dar… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan K… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
Inner Join
df1 = data.frame(ID = c(1, 2, 3, 4, 5),
w = c('a', 'b', 'c', 'd', 'e'),
x = c(1, 1, 0, 0, 1),
y=rnorm(5),
z=letters[1:5])
df2 = data.frame(ID = c(1, 7, 3, 6, 8),
a = c('z', 'b', 'k', 'd', 'l'),
b = c(1, 2, 3, 0, 4),
c =rnorm(5),
d =letters[2:6])
inner_join(df1, df2, by = "ID")
## ID w x y z a b c d
## 1 1 a 1 -0.5690531 a z 1 -1.110319 b
## 2 3 c 0 0.1303399 c k 3 1.706544 d
primary key does not have same name in both the tables
df1<-rename(df1, ID1=ID)
inner_join(df1, df2, by = c("ID1"="ID"))
## ID1 w x y z a b c d
## 1 1 a 1 -0.5690531 a z 1 -1.110319 b
## 2 3 c 0 0.1303399 c k 3 1.706544 d
Left Join
left_join(df1, df2, by = c("ID1"="ID"))
## ID1 w x y z a b c d
## 1 1 a 1 -0.5690531 a z 1 -1.110319 b
## 2 2 b 1 -0.9262669 b <NA> NA NA <NA>
## 3 3 c 0 0.1303399 c k 3 1.706544 d
## 4 4 d 0 1.3308165 d <NA> NA NA <NA>
## 5 5 e 1 -1.1404569 e <NA> NA NA <NA>
rbind & cbind
df1=data.frame(ID = 1:6, x=letters[1:6])
df2=data.frame(ID = 7:12, x=letters[7:12])
rbind(df1,df2)
## ID x
## 1 1 a
## 2 2 b
## 3 3 c
## 4 4 d
## 5 5 e
## 6 6 f
## 7 7 g
## 8 8 h
## 9 9 i
## 10 10 j
## 11 11 k
## 12 12 l
cbind(df1,df2)
## ID x ID x
## 1 1 a 7 g
## 2 2 b 8 h
## 3 3 c 9 i
## 4 4 d 10 j
## 5 5 e 11 k
## 6 6 f 12 l
Prepare Sample Data for Demonstration
mtcars$model <- rownames(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 12 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp : num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat : num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec : num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear : num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb : num 4 4 1 1 2 1 4 2 2 4 ...
## $ model: chr "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
first <- mtcars[1:20, ]
second <- mtcars[10:32, ]