2. List the variables in your dataset

# Show the column (variable) names of the data set
colnames(penguins_size)
## [1] "species"           "island"            "culmen_length_mm" 
## [4] "culmen_depth_mm"   "flipper_length_mm" "body_mass_g"      
## [7] "sex"

4. Write a user-defined function using any of the variables from the data set.

Convert grams to kilograms

# Convert a mass from grams to kilograms.
#
# g: numeric vector of masses in grams.
# Returns a numeric vector of the same length, each element divided by 1000.
# NA inputs stay NA.
grams_to_kg <- function(g) {
  g / 1000
}
# Add a body_mass_kg column computed from body_mass_g
penguins_with_kg <- mutate(
  penguins_size,
  body_mass_kg = grams_to_kg(body_mass_g)
)
# Print the first 15 rows to check that the new column is present
head(penguins_with_kg, n = 15)
## # A tibble: 15 × 8
##    species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>             <dbl>           <dbl>             <dbl>       <dbl>
##  1 Adelie  Torge…             39.1            18.7               181        3750
##  2 Adelie  Torge…             39.5            17.4               186        3800
##  3 Adelie  Torge…             40.3            18                 195        3250
##  4 Adelie  Torge…             NA              NA                  NA          NA
##  5 Adelie  Torge…             36.7            19.3               193        3450
##  6 Adelie  Torge…             39.3            20.6               190        3650
##  7 Adelie  Torge…             38.9            17.8               181        3625
##  8 Adelie  Torge…             39.2            19.6               195        4675
##  9 Adelie  Torge…             34.1            18.1               193        3475
## 10 Adelie  Torge…             42              20.2               190        4250
## 11 Adelie  Torge…             37.8            17.1               186        3300
## 12 Adelie  Torge…             37.8            17.3               180        3700
## 13 Adelie  Torge…             41.1            17.6               182        3200
## 14 Adelie  Torge…             38.6            21.2               191        3800
## 15 Adelie  Torge…             34.6            21.1               198        4400
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>

5. Use data manipulation techniques and filter rows based on any logical criteria that exist in your dataset.

Select penguins heavier than 5000g

# Keep only rows whose body mass exceeds 5000 g (rows with NA mass are dropped)
filter(penguins_size, body_mass_g > 5000)
## # A tibble: 61 × 7
##    species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>             <dbl>           <dbl>             <dbl>       <dbl>
##  1 Gentoo  Biscoe             50              16.3               230        5700
##  2 Gentoo  Biscoe             50              15.2               218        5700
##  3 Gentoo  Biscoe             47.6            14.5               215        5400
##  4 Gentoo  Biscoe             46.7            15.3               219        5200
##  5 Gentoo  Biscoe             46.8            15.4               215        5150
##  6 Gentoo  Biscoe             49              16.1               216        5550
##  7 Gentoo  Biscoe             48.4            14.6               213        5850
##  8 Gentoo  Biscoe             49.3            15.7               217        5850
##  9 Gentoo  Biscoe             49.2            15.2               221        6300
## 10 Gentoo  Biscoe             48.7            15.1               222        5350
## # ℹ 51 more rows
## # ℹ 1 more variable: sex <chr>

6. Identify the dependent & independent variables and use reshaping techniques and create a new data frame by joining those variables from your dataset.

# Build a new data frame from the dependent variable (body mass in kg) and two
# independent variables (species, sex).
# data.frame() keeps each column's own type. cbind() on bare vectors would
# build a character matrix and silently turn the numeric kg column into text.
new_penguins <- data.frame(
  species = penguins_with_kg$species,
  sex = penguins_with_kg$sex,
  kg = penguins_with_kg$body_mass_kg
)
head(new_penguins, 20)
##    species    sex    kg
## 1   Adelie   MALE 3.750
## 2   Adelie FEMALE 3.800
## 3   Adelie FEMALE 3.250
## 4   Adelie   <NA>    NA
## 5   Adelie FEMALE 3.450
## 6   Adelie   MALE 3.650
## 7   Adelie FEMALE 3.625
## 8   Adelie   MALE 4.675
## 9   Adelie   <NA> 3.475
## 10  Adelie   <NA> 4.250
## 11  Adelie   <NA> 3.300
## 12  Adelie   <NA> 3.700
## 13  Adelie FEMALE 3.200
## 14  Adelie   MALE 3.800
## 15  Adelie   MALE 4.400
## 16  Adelie FEMALE 3.700
## 17  Adelie FEMALE 3.450
## 18  Adelie   MALE 4.500
## 19  Adelie FEMALE 3.325
## 20  Adelie   MALE 4.200

7. Remove missing values in your dataset.

# Drop every row that has a missing value in any column
penguins_clean <- penguins_with_kg %>% na.omit()
# Request up to 344 rows so the full cleaned table is returned
head(penguins_clean, n = 344)
## # A tibble: 334 × 8
##    species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>             <dbl>           <dbl>             <dbl>       <dbl>
##  1 Adelie  Torge…             39.1            18.7               181        3750
##  2 Adelie  Torge…             39.5            17.4               186        3800
##  3 Adelie  Torge…             40.3            18                 195        3250
##  4 Adelie  Torge…             36.7            19.3               193        3450
##  5 Adelie  Torge…             39.3            20.6               190        3650
##  6 Adelie  Torge…             38.9            17.8               181        3625
##  7 Adelie  Torge…             39.2            19.6               195        4675
##  8 Adelie  Torge…             41.1            17.6               182        3200
##  9 Adelie  Torge…             38.6            21.2               191        3800
## 10 Adelie  Torge…             34.6            21.1               198        4400
## # ℹ 324 more rows
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>

8. Identify and remove duplicated data in your dataset.

# Keep only the first occurrence of each fully identical row
penguins_unique <- penguins_clean[which(!duplicated(penguins_clean)), ]
# Request up to 344 rows so the full de-duplicated table is returned
head(penguins_unique, n = 344)
## # A tibble: 334 × 8
##    species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>             <dbl>           <dbl>             <dbl>       <dbl>
##  1 Adelie  Torge…             39.1            18.7               181        3750
##  2 Adelie  Torge…             39.5            17.4               186        3800
##  3 Adelie  Torge…             40.3            18                 195        3250
##  4 Adelie  Torge…             36.7            19.3               193        3450
##  5 Adelie  Torge…             39.3            20.6               190        3650
##  6 Adelie  Torge…             38.9            17.8               181        3625
##  7 Adelie  Torge…             39.2            19.6               195        4675
##  8 Adelie  Torge…             41.1            17.6               182        3200
##  9 Adelie  Torge…             38.6            21.2               191        3800
## 10 Adelie  Torge…             34.6            21.1               198        4400
## # ℹ 324 more rows
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>

9. Reorder multiple rows in descending order

# Sort by species (Z to A), then by body mass in kg, heaviest first
arrange(penguins_clean, desc(species), desc(body_mass_kg))
## # A tibble: 334 × 8
##    species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
##    <chr>   <chr>             <dbl>           <dbl>             <dbl>       <dbl>
##  1 Gentoo  Biscoe             49.2            15.2               221        6300
##  2 Gentoo  Biscoe             59.6            17                 230        6050
##  3 Gentoo  Biscoe             51.1            16.3               220        6000
##  4 Gentoo  Biscoe             48.8            16.2               222        6000
##  5 Gentoo  Biscoe             45.2            16.4               223        5950
##  6 Gentoo  Biscoe             49.8            15.9               229        5950
##  7 Gentoo  Biscoe             48.4            14.6               213        5850
##  8 Gentoo  Biscoe             49.3            15.7               217        5850
##  9 Gentoo  Biscoe             55.1            16                 230        5850
## 10 Gentoo  Biscoe             49.5            16.2               229        5800
## # ℹ 324 more rows
## # ℹ 2 more variables: sex <chr>, body_mass_kg <dbl>

10. Rename some of the column names in your dataset

# Rename four columns by position in a single assignment:
# 3 -> "length(mm)", 4 -> "depth(mm)", 6 -> "grams", 8 -> "kilograms".
# The new names containing parentheses are non-syntactic and must be quoted
# with backticks when referenced later.
names(penguins_unique)[c(3, 4, 6, 8)] <- c(
  "length(mm)", "depth(mm)", "grams", "kilograms"
)
head(penguins_unique)
## # A tibble: 6 × 8
##   species island    `length(mm)` `depth(mm)` flipper_length_mm grams sex   
##   <chr>   <chr>            <dbl>       <dbl>             <dbl> <dbl> <chr> 
## 1 Adelie  Torgersen         39.1        18.7               181  3750 MALE  
## 2 Adelie  Torgersen         39.5        17.4               186  3800 FEMALE
## 3 Adelie  Torgersen         40.3        18                 195  3250 FEMALE
## 4 Adelie  Torgersen         36.7        19.3               193  3450 FEMALE
## 5 Adelie  Torgersen         39.3        20.6               190  3650 MALE  
## 6 Adelie  Torgersen         38.9        17.8               181  3625 FEMALE
## # ℹ 1 more variable: kilograms <dbl>

11. Add new variables in your data frame by using a mathematical function (for e.g. – multiply an existing column by 2 and add it as a new variable to your data frame)

# Add bill_ratio: the renamed length column divided by the renamed depth column
penguins_unique$bill_ratio <- with(
  penguins_unique,
  `length(mm)` / `depth(mm)`
)

head(penguins_unique)
## # A tibble: 6 × 9
##   species island    `length(mm)` `depth(mm)` flipper_length_mm grams sex   
##   <chr>   <chr>            <dbl>       <dbl>             <dbl> <dbl> <chr> 
## 1 Adelie  Torgersen         39.1        18.7               181  3750 MALE  
## 2 Adelie  Torgersen         39.5        17.4               186  3800 FEMALE
## 3 Adelie  Torgersen         40.3        18                 195  3250 FEMALE
## 4 Adelie  Torgersen         36.7        19.3               193  3450 FEMALE
## 5 Adelie  Torgersen         39.3        20.6               190  3650 MALE  
## 6 Adelie  Torgersen         38.9        17.8               181  3625 FEMALE
## # ℹ 2 more variables: kilograms <dbl>, bill_ratio <dbl>

12. Create a training set using random number generator engine.

# Fix the RNG seed so the random draw is reproducible
set.seed(123)
# Randomly draw 5% of the rows, without replacement, as the training set
training_set <- sample_frac(penguins_unique, size = 0.05, replace = FALSE)
head(training_set, n = 334)
## # A tibble: 17 × 9
##    species   island    `length(mm)` `depth(mm)` flipper_length_mm grams sex   
##    <chr>     <chr>            <dbl>       <dbl>             <dbl> <dbl> <chr> 
##  1 Chinstrap Dream             42.5        16.7               187  3350 FEMALE
##  2 Adelie    Torgersen         34.4        18.4               184  3325 FEMALE
##  3 Chinstrap Dream             51.5        18.7               187  3250 MALE  
##  4 Gentoo    Biscoe            49.5        16.1               224  5650 MALE  
##  5 Adelie    Torgersen         41.4        18.5               202  3875 MALE  
##  6 Gentoo    Biscoe            43.4        14.4               218  4600 FEMALE
##  7 Gentoo    Biscoe            45.8        14.6               210  4200 FEMALE
##  8 Gentoo    Biscoe            47.3        15.3               222  5250 MALE  
##  9 Gentoo    Biscoe            45.2        14.8               212  5200 FEMALE
## 10 Chinstrap Dream             46.1        18.2               178  3250 FEMALE
## 11 Adelie    Dream             40.8        18.9               208  4300 MALE  
## 12 Adelie    Dream             38.1        18.6               190  3700 FEMALE
## 13 Gentoo    Biscoe            45.3        13.7               210  4300 FEMALE
## 14 Chinstrap Dream             48.1        16.4               199  3325 FEMALE
## 15 Gentoo    Biscoe            49.8        15.9               229  5950 MALE  
## 16 Adelie    Dream             32.1        15.5               188  3050 FEMALE
## 17 Adelie    Dream             39.5        16.7               178  3250 FEMALE
## # ℹ 2 more variables: kilograms <dbl>, bill_ratio <dbl>

14. Use any of the numerical variables from the dataset and perform the statistical functions

# Numeric column summarised below: the "grams" column of penguins_unique
target_variable <- penguins_unique[["grams"]]

# Mean (missing values ignored)
mean_value <- mean(target_variable, na.rm = TRUE)
cat("Mean grams:", mean_value, "\n")
## Mean grams: 4209.057
# Median (missing values ignored)
median_value <- median(target_variable, na.rm = TRUE)
cat("Median grams:", median_value, "\n")
## Median grams: 4050
# Mode: the most frequent value; which.max() returns the first one on ties.
# The result is a character string because it is taken from the table's names.
freq_table <- table(target_variable)
mode_value <- names(which.max(freq_table))
cat("Mode grams:", mode_value, "\n")
## Mode grams: 3800
# Range: minimum and maximum (missing values ignored)
range_value <- range(target_variable, na.rm = TRUE)
cat("Range output:", range_value, "\n")
## Range output: 2700 6300

15. Plot a scatter plot for any 2 variables in your dataset

# Scatter plot: flipper length (x) against body mass in grams (y)
ggplot(penguins_unique, aes(flipper_length_mm, grams)) + geom_point()

16. Plot a bar plot for any 2 variables in your dataset

# Bar plot: number of rows per species
ggplot(penguins_unique, aes(x = species)) +
  geom_bar(fill = "aquamarine4")

17. Find the correlation between any 2 variables by applying Pearson correlation

# Pearson correlation between flipper length and body mass in kilograms
with(penguins_unique, cor(flipper_length_mm, kilograms, method = "pearson"))
## [1] 0.873211