1. Print the structure of your dataset
# Compact overview of the data: dimensions, column types and leading values.
str(object = penguins_size)
## spc_tbl_ [344 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ species : chr [1:344] "Adelie" "Adelie" "Adelie" "Adelie" ...
## $ island : chr [1:344] "Torgersen" "Torgersen" "Torgersen" "Torgersen" ...
## $ culmen_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ culmen_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: num [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : num [1:344] 3750 3800 3250 NA 3450 ...
## $ sex : chr [1:344] "MALE" "FEMALE" "FEMALE" NA ...
## - attr(*, "spec")=
## .. cols(
## .. species = col_character(),
## .. island = col_character(),
## .. culmen_length_mm = col_double(),
## .. culmen_depth_mm = col_double(),
## .. flipper_length_mm = col_double(),
## .. body_mass_g = col_double(),
## .. sex = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
2. List the variables in your dataset
# Column (variable) names of the data frame.
colnames(penguins_size)
## [1] "species" "island" "culmen_length_mm"
## [4] "culmen_depth_mm" "flipper_length_mm" "body_mass_g"
## [7] "sex"
3. Print the top 15 rows of your dataset
# First 15 observations.
head(penguins_size, n = 15)
## # A tibble: 15 × 7
## species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torge… 39.1 18.7 181 3750
## 2 Adelie Torge… 39.5 17.4 186 3800
## 3 Adelie Torge… 40.3 18 195 3250
## 4 Adelie Torge… NA NA NA NA
## 5 Adelie Torge… 36.7 19.3 193 3450
## 6 Adelie Torge… 39.3 20.6 190 3650
## 7 Adelie Torge… 38.9 17.8 181 3625
## 8 Adelie Torge… 39.2 19.6 195 4675
## 9 Adelie Torge… 34.1 18.1 193 3475
## 10 Adelie Torge… 42 20.2 190 4250
## 11 Adelie Torge… 37.8 17.1 186 3300
## 12 Adelie Torge… 37.8 17.3 180 3700
## 13 Adelie Torge… 41.1 17.6 182 3200
## 14 Adelie Torge… 38.6 21.2 191 3800
## 15 Adelie Torge… 34.6 21.1 198 4400
## # ℹ 1 more variable: sex <chr>
4. Write a user defined function using any of the variables from the
data set.
Convert grams to kilograms
# Convert a mass given in grams to kilograms.
# The division is element-wise, so a whole column can be passed at once;
# NA entries stay NA.
grams_to_kg <- function(g) {
  g / 1000
}
# Append the converted body mass as a new column, body_mass_kg.
penguins_with_kg <- penguins_size %>%
  mutate(body_mass_kg = grams_to_kg(body_mass_g))
# Print the first 15 rows to confirm the new column was added.
head(penguins_with_kg, n = 15)
## # A tibble: 15 × 8
## species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torge… 39.1 18.7 181 3750
## 2 Adelie Torge… 39.5 17.4 186 3800
## 3 Adelie Torge… 40.3 18 195 3250
## 4 Adelie Torge… NA NA NA NA
## 5 Adelie Torge… 36.7 19.3 193 3450
## 6 Adelie Torge… 39.3 20.6 190 3650
## 7 Adelie Torge… 38.9 17.8 181 3625
## 8 Adelie Torge… 39.2 19.6 195 4675
## 9 Adelie Torge… 34.1 18.1 193 3475
## 10 Adelie Torge… 42 20.2 190 4250
## 11 Adelie Torge… 37.8 17.1 186 3300
## 12 Adelie Torge… 37.8 17.3 180 3700
## 13 Adelie Torge… 41.1 17.6 182 3200
## 14 Adelie Torge… 38.6 21.2 191 3800
## 15 Adelie Torge… 34.6 21.1 198 4400
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>
5. Use data manipulation techniques and filter rows based on any
logical criteria that exist in your dataset.
Select penguins heavier than 5000g
# Keep only the rows whose body mass exceeds 5000 g.
filter(penguins_size, body_mass_g > 5000)
## # A tibble: 61 × 7
## species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Gentoo Biscoe 50 16.3 230 5700
## 2 Gentoo Biscoe 50 15.2 218 5700
## 3 Gentoo Biscoe 47.6 14.5 215 5400
## 4 Gentoo Biscoe 46.7 15.3 219 5200
## 5 Gentoo Biscoe 46.8 15.4 215 5150
## 6 Gentoo Biscoe 49 16.1 216 5550
## 7 Gentoo Biscoe 48.4 14.6 213 5850
## 8 Gentoo Biscoe 49.3 15.7 217 5850
## 9 Gentoo Biscoe 49.2 15.2 221 6300
## 10 Gentoo Biscoe 48.7 15.1 222 5350
## # ℹ 51 more rows
## # ℹ 1 more variable: sex <chr>
6. Identify the dependent & independent variables and use
reshaping techniques and create a new data frame by joining those
variables from your dataset.
# Collect species, sex and body mass (kg) into a new data frame.
# NOTE(review): presumably kg is the dependent variable and species/sex the
# independent ones -- confirm against the assignment.
# data.frame() keeps each column's own type. cbind() on bare vectors first
# builds a matrix, and mixing character with numeric vectors makes that a
# character matrix, which would turn the kg values into strings.
new_penguins <- data.frame(
  species = penguins_with_kg$species,
  sex = penguins_with_kg$sex,
  kg = penguins_with_kg$body_mass_kg
)
head(new_penguins, 20)
## species sex kg
## 1 Adelie MALE 3.75
## 2 Adelie FEMALE 3.8
## 3 Adelie FEMALE 3.25
## 4 Adelie <NA> <NA>
## 5 Adelie FEMALE 3.45
## 6 Adelie MALE 3.65
## 7 Adelie FEMALE 3.625
## 8 Adelie MALE 4.675
## 9 Adelie <NA> 3.475
## 10 Adelie <NA> 4.25
## 11 Adelie <NA> 3.3
## 12 Adelie <NA> 3.7
## 13 Adelie FEMALE 3.2
## 14 Adelie MALE 3.8
## 15 Adelie MALE 4.4
## 16 Adelie FEMALE 3.7
## 17 Adelie FEMALE 3.45
## 18 Adelie MALE 4.5
## 19 Adelie FEMALE 3.325
## 20 Adelie MALE 4.2
7. Remove missing values in your dataset.
# Drop every row that contains an NA in any column.
penguins_clean <- na.omit(object = penguins_with_kg)
# Request up to 344 rows (the size of the raw data); the tibble print method
# still abbreviates what is shown.
head(penguins_clean, n = 344)
## # A tibble: 334 × 8
## species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torge… 39.1 18.7 181 3750
## 2 Adelie Torge… 39.5 17.4 186 3800
## 3 Adelie Torge… 40.3 18 195 3250
## 4 Adelie Torge… 36.7 19.3 193 3450
## 5 Adelie Torge… 39.3 20.6 190 3650
## 6 Adelie Torge… 38.9 17.8 181 3625
## 7 Adelie Torge… 39.2 19.6 195 4675
## 8 Adelie Torge… 41.1 17.6 182 3200
## 9 Adelie Torge… 38.6 21.2 191 3800
## 10 Adelie Torge… 34.6 21.1 198 4400
## # ℹ 324 more rows
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>
8. Identify and remove duplicated data in your dataset
# duplicated() flags rows that exactly repeat an earlier row; keep the rest.
penguins_unique <- penguins_clean[
  !duplicated(penguins_clean),
]
head(penguins_unique, n = 344)
## # A tibble: 334 × 8
## species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adelie Torge… 39.1 18.7 181 3750
## 2 Adelie Torge… 39.5 17.4 186 3800
## 3 Adelie Torge… 40.3 18 195 3250
## 4 Adelie Torge… 36.7 19.3 193 3450
## 5 Adelie Torge… 39.3 20.6 190 3650
## 6 Adelie Torge… 38.9 17.8 181 3625
## 7 Adelie Torge… 39.2 19.6 195 4675
## 8 Adelie Torge… 41.1 17.6 182 3200
## 9 Adelie Torge… 38.6 21.2 191 3800
## 10 Adelie Torge… 34.6 21.1 198 4400
## # ℹ 324 more rows
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>
9. Reorder multiple rows in descending order
# Sort by species in reverse alphabetical order, then heaviest first within
# each species.
arrange(penguins_clean, desc(species), desc(body_mass_kg))
## # A tibble: 334 × 8
## species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Gentoo Biscoe 49.2 15.2 221 6300
## 2 Gentoo Biscoe 59.6 17 230 6050
## 3 Gentoo Biscoe 51.1 16.3 220 6000
## 4 Gentoo Biscoe 48.8 16.2 222 6000
## 5 Gentoo Biscoe 45.2 16.4 223 5950
## 6 Gentoo Biscoe 49.8 15.9 229 5950
## 7 Gentoo Biscoe 48.4 14.6 213 5850
## 8 Gentoo Biscoe 49.3 15.7 217 5850
## 9 Gentoo Biscoe 55.1 16 230 5850
## 10 Gentoo Biscoe 49.5 16.2 229 5800
## # ℹ 324 more rows
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>
10. Rename some of the column names in your dataset
# Rename columns by their current name rather than by position, so a change
# in column order cannot silently relabel the wrong column.
# The new names `length(mm)` and `depth(mm)` are non-syntactic and must be
# wrapped in backticks wherever they are referenced.
penguins_unique <- penguins_unique %>%
  rename(
    kilograms = body_mass_kg,
    grams = body_mass_g,
    `length(mm)` = culmen_length_mm,
    `depth(mm)` = culmen_depth_mm
  )
head(penguins_unique)
## # A tibble: 6 × 8
## species island `length(mm)` `depth(mm)` flipper_length_mm grams sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Adelie Torgersen 39.1 18.7 181 3750 MALE
## 2 Adelie Torgersen 39.5 17.4 186 3800 FEMALE
## 3 Adelie Torgersen 40.3 18 195 3250 FEMALE
## 4 Adelie Torgersen 36.7 19.3 193 3450 FEMALE
## 5 Adelie Torgersen 39.3 20.6 190 3650 MALE
## 6 Adelie Torgersen 38.9 17.8 181 3625 FEMALE
## # ℹ 1 more variable: kilograms <dbl>
11. Add new variables in your data frame by using a mathematical
function (e.g., multiply an existing column by 2 and add it as a
new variable to your data frame)
# Bill shape index: the `length(mm)` column divided by the `depth(mm)` column.
penguins_unique[["bill_ratio"]] <-
  penguins_unique[["length(mm)"]] / penguins_unique[["depth(mm)"]]
head(penguins_unique)
## # A tibble: 6 × 9
## species island `length(mm)` `depth(mm)` flipper_length_mm grams sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Adelie Torgersen 39.1 18.7 181 3750 MALE
## 2 Adelie Torgersen 39.5 17.4 186 3800 FEMALE
## 3 Adelie Torgersen 40.3 18 195 3250 FEMALE
## 4 Adelie Torgersen 36.7 19.3 193 3450 FEMALE
## 5 Adelie Torgersen 39.3 20.6 190 3650 MALE
## 6 Adelie Torgersen 38.9 17.8 181 3625 FEMALE
## # ℹ 2 more variables: kilograms <dbl>, bill_ratio <dbl>
12. Create a training set using random number generator engine.
# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)
# Draw 5% of the rows, without replacement, as the training set.
training_set <- sample_frac(penguins_unique, size = 0.05, replace = FALSE)
head(training_set, n = 334)
## # A tibble: 17 × 9
## species island `length(mm)` `depth(mm)` flipper_length_mm grams sex
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Chinstrap Dream 42.5 16.7 187 3350 FEMALE
## 2 Adelie Torgersen 34.4 18.4 184 3325 FEMALE
## 3 Chinstrap Dream 51.5 18.7 187 3250 MALE
## 4 Gentoo Biscoe 49.5 16.1 224 5650 MALE
## 5 Adelie Torgersen 41.4 18.5 202 3875 MALE
## 6 Gentoo Biscoe 43.4 14.4 218 4600 FEMALE
## 7 Gentoo Biscoe 45.8 14.6 210 4200 FEMALE
## 8 Gentoo Biscoe 47.3 15.3 222 5250 MALE
## 9 Gentoo Biscoe 45.2 14.8 212 5200 FEMALE
## 10 Chinstrap Dream 46.1 18.2 178 3250 FEMALE
## 11 Adelie Dream 40.8 18.9 208 4300 MALE
## 12 Adelie Dream 38.1 18.6 190 3700 FEMALE
## 13 Gentoo Biscoe 45.3 13.7 210 4300 FEMALE
## 14 Chinstrap Dream 48.1 16.4 199 3325 FEMALE
## 15 Gentoo Biscoe 49.8 15.9 229 5950 MALE
## 16 Adelie Dream 32.1 15.5 188 3050 FEMALE
## 17 Adelie Dream 39.5 16.7 178 3250 FEMALE
## # ℹ 2 more variables: kilograms <dbl>, bill_ratio <dbl>
13. Print the summary statistics of your dataset
# Per-column summary: quartiles and mean for numeric columns, length and
# class for character columns.
summary(object = penguins_unique)
## species island length(mm) depth(mm)
## Length:334 Length:334 Min. :32.10 Min. :13.10
## Class :character Class :character 1st Qu.:39.50 1st Qu.:15.60
## Mode :character Mode :character Median :44.50 Median :17.30
## Mean :43.99 Mean :17.16
## 3rd Qu.:48.58 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## flipper_length_mm grams sex kilograms
## Min. :172 Min. :2700 Length:334 Min. :2.700
## 1st Qu.:190 1st Qu.:3550 Class :character 1st Qu.:3.550
## Median :197 Median :4050 Mode :character Median :4.050
## Mean :201 Mean :4209 Mean :4.209
## 3rd Qu.:213 3rd Qu.:4794 3rd Qu.:4.794
## Max. :231 Max. :6300 Max. :6.300
## bill_ratio
## Min. :1.640
## 1st Qu.:2.163
## Median :2.577
## Mean :2.608
## 3rd Qu.:3.096
## Max. :3.613
15. Plot a scatter plot for any 2 variables in your dataset
# Scatter plot of body mass (grams) against flipper length (mm).
ggplot(penguins_unique, aes(x = flipper_length_mm, y = grams)) +
  geom_point()

16. Plot a bar plot for any 2 variables in your dataset
# Bar plot of the number of rows per species.
ggplot(penguins_unique, aes(x = species)) +
  geom_bar(fill = "aquamarine4")

17. Find the correlation between any 2 variables by applying Pearson
correlation
# Pearson correlation between flipper length and body mass in kilograms.
with(
  penguins_unique,
  cor(flipper_length_mm, kilograms, method = "pearson")
)
## [1] 0.873211