#loading data sets mtcars
#######QUESTION 1
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tibble' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'forcats' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the built-in mtcars data set into the workspace.
data(mtcars)
# Open the data viewer only in interactive sessions: View() needs a GUI data
# viewer and can fail when the script is run where none is available.
if (interactive()) {
  View(mtcars)
}
# Mean, median, standard deviation and range of mpg.
# range() returns two values (min, max), so passing it to reframe() produced
# two rows with the other statistics repeated. Reporting the minimum and
# maximum as separate named columns keeps the summary to a single row.
mtcars %>%
  summarise(
    mean_mpg = mean(mpg),
    median_mpg = median(mpg),
    sd_mpg = sd(mpg),
    min_mpg = min(mpg),
    max_mpg = max(mpg)
  )
## mean_mpg median_mpg sd_mpg min_mpg max_mpg
## 1 20.09062 19.2 6.026948 10.4 33.9
# Size of mtcars: number of variables (columns), then observations (rows)
mtcars %>% ncol()
## [1] 11
mtcars %>% nrow()
## [1] 32
# count() with no grouping columns reports the row count in a column named n
mtcars %>% count()
## n
## 1 32
# Keep only the cars whose fuel economy exceeds 20 mpg
filter_mtcars <- mtcars %>%
  filter(mpg > 20)
filter_mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Show only the first few rows of the filtered result (6 is head()'s default)
head(filter_mtcars, n = 6L)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
# Average mpg for each distinct value of cyl (number of cylinders)
avg_mpg_by_cyl <- mtcars %>%
  group_by(cyl) %>%
  summarise(avg_mpg = mean(mpg))
avg_mpg_by_cyl
## # A tibble: 3 × 2
## cyl avg_mpg
## <dbl> <dbl>
## 1 4 26.7
## 2 6 19.7
## 3 8 15.1
###### QUESTION 2
# Load the airquality data set
data("airquality")
# Keep only the observations recorded in May (Month == 5)
filter_airquality <- airquality %>%
  filter(Month == 5)
print(filter_airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
## 7 23 299 8.6 65 5 7
## 8 19 99 13.8 59 5 8
## 9 8 19 20.1 61 5 9
## 10 NA 194 8.6 69 5 10
## 11 7 NA 6.9 74 5 11
## 12 16 256 9.7 69 5 12
## 13 11 290 9.2 66 5 13
## 14 14 274 10.9 68 5 14
## 15 18 65 13.2 58 5 15
## 16 14 334 11.5 64 5 16
## 17 34 307 12.0 66 5 17
## 18 6 78 18.4 57 5 18
## 19 30 322 11.5 68 5 19
## 20 11 44 9.7 62 5 20
## 21 1 8 9.7 59 5 21
## 22 11 320 16.6 73 5 22
## 23 4 25 9.7 61 5 23
## 24 32 92 12.0 61 5 24
## 25 NA 66 16.6 57 5 25
## 26 NA 266 14.9 58 5 26
## 27 NA NA 8.0 57 5 27
## 28 23 13 12.0 67 5 28
## 29 45 252 14.9 81 5 29
## 30 115 223 5.7 79 5 30
## 31 37 279 7.4 76 5 31
### Average Ozone level for May; missing (NA) readings are dropped first
average_ozone_may <- mean(filter_airquality[["Ozone"]], na.rm = TRUE)
average_ozone_may
## [1] 23.61538
##### QUESTION 3
#### Randomly select 20% of the rows of iris
data(iris)
# Fix the RNG seed so the same rows are drawn on every run.
# NOTE: the sample and mean printed further down were captured from an
# earlier unseeded run and will differ from what this seeded code prints.
set.seed(123)
# floor() makes explicit the truncation sample() applies to a fractional size
sample_size <- floor(0.2 * nrow(iris))
sampled_rows <- sample(seq_len(nrow(iris)), size = sample_size)
sampled_data <- iris[sampled_rows, ]
sampled_data
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 15 5.8 4.0 1.2 0.2 setosa
## 82 5.5 2.4 3.7 1.0 versicolor
## 69 6.2 2.2 4.5 1.5 versicolor
## 43 4.4 3.2 1.3 0.2 setosa
## 52 6.4 3.2 4.5 1.5 versicolor
## 129 6.4 2.8 5.6 2.1 virginica
## 98 6.2 2.9 4.3 1.3 versicolor
## 71 5.9 3.2 4.8 1.8 versicolor
## 135 6.1 2.6 5.6 1.4 virginica
## 61 5.0 2.0 3.5 1.0 versicolor
## 45 5.1 3.8 1.9 0.4 setosa
## 65 5.6 2.9 3.6 1.3 versicolor
## 5 5.0 3.6 1.4 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
## 136 7.7 3.0 6.1 2.3 virginica
## 73 6.3 2.5 4.9 1.5 versicolor
## 74 6.1 2.8 4.7 1.2 versicolor
## 10 4.9 3.1 1.5 0.1 setosa
## 149 6.2 3.4 5.4 2.3 virginica
## 9 4.4 2.9 1.4 0.2 setosa
## 27 5.0 3.4 1.6 0.4 setosa
## 95 5.6 2.7 4.2 1.3 versicolor
## 117 6.5 3.0 5.5 1.8 virginica
## 3 4.7 3.2 1.3 0.2 setosa
## 41 5.0 3.5 1.3 0.3 setosa
## 28 5.2 3.5 1.5 0.2 setosa
## 128 6.1 3.0 4.9 1.8 virginica
## 49 5.3 3.7 1.5 0.2 setosa
## 31 4.8 3.1 1.6 0.2 setosa
## 8 5.0 3.4 1.5 0.2 setosa
##### Mean of Sepal.Length over the sampled rows
mean_sepal_length <- mean(sampled_data[["Sepal.Length"]])
mean_sepal_length
## [1] 5.603333
##### QUESTION 4
#### Load the diamonds data set
data("diamonds")
### Keep only the columns carat, cut and price
selected_data <- select(diamonds, carat, cut, price)
selected_data
## # A tibble: 53,940 × 3
## carat cut price
## <dbl> <ord> <int>
## 1 0.23 Ideal 326
## 2 0.21 Premium 326
## 3 0.23 Good 327
## 4 0.29 Premium 334
## 5 0.31 Good 335
## 6 0.24 Very Good 336
## 7 0.24 Very Good 336
## 8 0.26 Very Good 337
## 9 0.22 Fair 337
## 10 0.23 Very Good 338
## # ℹ 53,930 more rows
### Keep the rows where carat is greater than 1
filtered_data <- filter(selected_data, carat > 1)
filtered_data
## # A tibble: 17,502 × 3
## carat cut price
## <dbl> <ord> <int>
## 1 1.17 Very Good 2774
## 2 1.01 Premium 2781
## 3 1.01 Fair 2788
## 4 1.01 Premium 2788
## 5 1.05 Very Good 2789
## 6 1.05 Fair 2789
## 7 1.01 Fair 2797
## 8 1.04 Premium 2801
## 9 1.2 Fair 2809
## 10 1.02 Premium 2815
## # ℹ 17,492 more rows
### Sort the rows by price, highest first
arranged_data <- arrange(filtered_data, desc(price))
arranged_data
## # A tibble: 17,502 × 3
## carat cut price
## <dbl> <ord> <int>
## 1 2.29 Premium 18823
## 2 2 Very Good 18818
## 3 1.51 Ideal 18806
## 4 2.07 Ideal 18804
## 5 2 Very Good 18803
## 6 2.29 Premium 18797
## 7 2.04 Premium 18795
## 8 2 Premium 18795
## 9 1.71 Premium 18791
## 10 2.15 Ideal 18791
## # ℹ 17,492 more rows
##### QUESTION 5
## Scatter plot of Sepal.Length against Sepal.Width from the iris data set,
## with points coloured by Species
library(ggplot2)
data(iris)
scatter_plot <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point() +
  labs(
    title = "Scatter Plot of Sepal Length and Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  )
print(scatter_plot)

#### QUESTION 6
## Bar plot of the average miles per gallon (mpg) for each level of the cyl
## variable in the mtcars data set
library(ggplot2)
# One row per cyl value, with the mpg column holding that group's mean
avg_mpg <- aggregate(mpg ~ cyl, data = mtcars, FUN = mean)
# The bar heights are already computed, so geom_col() is used: it is the
# dedicated geom for what geom_bar(stat = "identity") spelled out.
# factor(cyl) makes the x axis discrete, one bar per cylinder count.
bar_plot <- ggplot(avg_mpg, aes(x = factor(cyl), y = mpg)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Average MPG by Number of Cylinders",
    x = "Number of Cylinders",
    y = "Average MPG"
  )
print(bar_plot)

#### QUESTION 7
### Line chart of the unemploy variable against date in the economics
### data set
library(ggplot2)
data(economics)
line_chart <- ggplot(
  data = economics,
  mapping = aes(x = date, y = unemploy)
) +
  geom_line(color = "blue") +
  labs(
    title = "Unemployment Over Time",
    x = "Date",
    y = "Unemployment"
  )
print(line_chart)
