# Loading the mtcars data set
#######QUESTION 1
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tibble' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'readr' was built under R version 4.3.2
## Warning: package 'purrr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## Warning: package 'forcats' was built under R version 4.3.2
## Warning: package 'lubridate' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the built-in mtcars data set into the workspace.
data(mtcars)
# Open the spreadsheet-style viewer only in an interactive session: View()
# needs a GUI/display and can fail when the script runs non-interactively.
if (interactive()) {
  View(mtcars)
}
# Summary statistics for mpg: mean, median, standard deviation and range.
# range() returns two values (min, max); inside reframe() that produced two
# rows with the scalar statistics repeated. Reporting the minimum and maximum
# as separate columns keeps the summary to a single row.
mtcars %>%
  summarise(
    mean_mpg = mean(mpg),
    median_mpg = median(mpg),
    sd_mpg = sd(mpg),
    min_mpg = min(mpg),
    max_mpg = max(mpg)
  )
##   mean_mpg median_mpg   sd_mpg min_mpg max_mpg
## 1 20.09062       19.2 6.026948    10.4    33.9
# Size of mtcars: number of variables (columns), then observations (rows)
length(mtcars)
## [1] 11
NROW(mtcars)
## [1] 32
# Row count again, this time returned as a one-row data frame by dplyr
mtcars %>% count()
##    n
## 1 32
# Keep only the cars whose fuel economy exceeds 20 mpg
filter_mtcars <- mtcars %>%
  filter(mpg > 20)
filter_mtcars
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Fiat 128       32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic    30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla 33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona  21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Fiat X1-9      27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2  26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa   30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Volvo 142E     21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
# Show only the first few rows (six, the head() default) of the filtered cars
head(filter_mtcars, n = 6L)
##                 mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4      21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag  21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710     22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Merc 240D      24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230       22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
# Mean mpg for every distinct cylinder count ('cyl'), computed with dplyr:
# group the cars by cyl, then collapse each group to its average mpg.
avg_mpg_by_cyl <- mtcars %>%
  group_by(cyl) %>%
  summarise(avg_mpg = mean(mpg))
avg_mpg_by_cyl
## # A tibble: 3 × 2
##     cyl avg_mpg
##   <dbl>   <dbl>
## 1     4    26.7
## 2     6    19.7
## 3     8    15.1
######QUESTION 2
# Load the built-in airquality data set
data("airquality")
# Keep only the May observations (Month == 5)
filter_airquality <- airquality %>%
  filter(Month == 5)
print(filter_airquality)
##    Ozone Solar.R Wind Temp Month Day
## 1     41     190  7.4   67     5   1
## 2     36     118  8.0   72     5   2
## 3     12     149 12.6   74     5   3
## 4     18     313 11.5   62     5   4
## 5     NA      NA 14.3   56     5   5
## 6     28      NA 14.9   66     5   6
## 7     23     299  8.6   65     5   7
## 8     19      99 13.8   59     5   8
## 9      8      19 20.1   61     5   9
## 10    NA     194  8.6   69     5  10
## 11     7      NA  6.9   74     5  11
## 12    16     256  9.7   69     5  12
## 13    11     290  9.2   66     5  13
## 14    14     274 10.9   68     5  14
## 15    18      65 13.2   58     5  15
## 16    14     334 11.5   64     5  16
## 17    34     307 12.0   66     5  17
## 18     6      78 18.4   57     5  18
## 19    30     322 11.5   68     5  19
## 20    11      44  9.7   62     5  20
## 21     1       8  9.7   59     5  21
## 22    11     320 16.6   73     5  22
## 23     4      25  9.7   61     5  23
## 24    32      92 12.0   61     5  24
## 25    NA      66 16.6   57     5  25
## 26    NA     266 14.9   58     5  26
## 27    NA      NA  8.0   57     5  27
## 28    23      13 12.0   67     5  28
## 29    45     252 14.9   81     5  29
## 30   115     223  5.7   79     5  30
## 31    37     279  7.4   76     5  31
 ### Mean Ozone reading for May, ignoring missing (NA) measurements
average_ozone_may <- mean(filter_airquality[["Ozone"]], na.rm = TRUE)
average_ozone_may
## [1] 23.61538
##### QUESTION 3
#### Randomly select 20% of the rows of iris
data(iris)
# Fix the RNG seed so the same rows are drawn on every run; without it the
# sample (and everything computed from it) changes each time the script runs.
# NOTE: output recorded from an earlier, unseeded run will not match.
set.seed(123)
# Round so the sample size is a whole number of rows for any data size.
sample_size <- round(0.2 * nrow(iris))
sampled_data <- iris[sample(nrow(iris), size = sample_size), ]
sampled_data
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
## 15           5.8         4.0          1.2         0.2     setosa
## 82           5.5         2.4          3.7         1.0 versicolor
## 69           6.2         2.2          4.5         1.5 versicolor
## 43           4.4         3.2          1.3         0.2     setosa
## 52           6.4         3.2          4.5         1.5 versicolor
## 129          6.4         2.8          5.6         2.1  virginica
## 98           6.2         2.9          4.3         1.3 versicolor
## 71           5.9         3.2          4.8         1.8 versicolor
## 135          6.1         2.6          5.6         1.4  virginica
## 61           5.0         2.0          3.5         1.0 versicolor
## 45           5.1         3.8          1.9         0.4     setosa
## 65           5.6         2.9          3.6         1.3 versicolor
## 5            5.0         3.6          1.4         0.2     setosa
## 16           5.7         4.4          1.5         0.4     setosa
## 136          7.7         3.0          6.1         2.3  virginica
## 73           6.3         2.5          4.9         1.5 versicolor
## 74           6.1         2.8          4.7         1.2 versicolor
## 10           4.9         3.1          1.5         0.1     setosa
## 149          6.2         3.4          5.4         2.3  virginica
## 9            4.4         2.9          1.4         0.2     setosa
## 27           5.0         3.4          1.6         0.4     setosa
## 95           5.6         2.7          4.2         1.3 versicolor
## 117          6.5         3.0          5.5         1.8  virginica
## 3            4.7         3.2          1.3         0.2     setosa
## 41           5.0         3.5          1.3         0.3     setosa
## 28           5.2         3.5          1.5         0.2     setosa
## 128          6.1         3.0          4.9         1.8  virginica
## 49           5.3         3.7          1.5         0.2     setosa
## 31           4.8         3.1          1.6         0.2     setosa
## 8            5.0         3.4          1.5         0.2     setosa
##### Mean of Sepal.Length across the sampled rows
mean_sepal_length <- with(sampled_data, mean(Sepal.Length))
mean_sepal_length
## [1] 5.603333
##### QUESTION 4
#### Load the diamonds data set (shipped with ggplot2)
data("diamonds")
### Keep only the columns 'carat', 'cut' and 'price'
selected_data <- select(diamonds, carat, cut, price)
selected_data
## # A tibble: 53,940 × 3
##    carat cut       price
##    <dbl> <ord>     <int>
##  1  0.23 Ideal       326
##  2  0.21 Premium     326
##  3  0.23 Good        327
##  4  0.29 Premium     334
##  5  0.31 Good        335
##  6  0.24 Very Good   336
##  7  0.24 Very Good   336
##  8  0.26 Very Good   337
##  9  0.22 Fair        337
## 10  0.23 Very Good   338
## # ℹ 53,930 more rows
### Keep only the rows where 'carat' is greater than 1
filtered_data <- filter(selected_data, carat > 1)
filtered_data
## # A tibble: 17,502 × 3
##    carat cut       price
##    <dbl> <ord>     <int>
##  1  1.17 Very Good  2774
##  2  1.01 Premium    2781
##  3  1.01 Fair       2788
##  4  1.01 Premium    2788
##  5  1.05 Very Good  2789
##  6  1.05 Fair       2789
##  7  1.01 Fair       2797
##  8  1.04 Premium    2801
##  9  1.2  Fair       2809
## 10  1.02 Premium    2815
## # ℹ 17,492 more rows
### Sort the filtered diamonds by 'price', most expensive first
arranged_data <- arrange(filtered_data, desc(price))
arranged_data
## # A tibble: 17,502 × 3
##    carat cut       price
##    <dbl> <ord>     <int>
##  1  2.29 Premium   18823
##  2  2    Very Good 18818
##  3  1.51 Ideal     18806
##  4  2.07 Ideal     18804
##  5  2    Very Good 18803
##  6  2.29 Premium   18797
##  7  2.04 Premium   18795
##  8  2    Premium   18795
##  9  1.71 Premium   18791
## 10  2.15 Ideal     18791
## # ℹ 17,492 more rows
##### QUESTION 5
## Scatter plot of 'Sepal.Length' against 'Sepal.Width' from the 'iris'
## data set, with the points coloured by species
library(ggplot2)
data(iris)
scatter_plot <- ggplot(
  data = iris,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  labs(
    title = "Scatter Plot of Sepal Length and Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  ) +
  geom_point()
print(scatter_plot)

####  QUESTION 6
## Bar plot of the average miles per gallon ("mpg") for each level of the
## "cyl" variable in the mtcars data set
library(ggplot2)
# One row per cylinder count, holding the mean mpg of that group
avg_mpg <- aggregate(mpg ~ cyl, data = mtcars, FUN = mean)
# geom_col() draws bars whose heights are the y values themselves
bar_plot <- ggplot(data = avg_mpg, mapping = aes(x = factor(cyl), y = mpg)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Average MPG by Number of Cylinders",
    x = "Number of Cylinders",
    y = "Average MPG"
  )
print(bar_plot)

#### QUESTION 7
### Line chart of the "unemploy" variable over time in the "economics"
### data set
library(ggplot2)
data(economics)
line_chart <- ggplot(data = economics, mapping = aes(x = date, y = unemploy)) +
  labs(
    title = "Unemployment Over Time",
    x = "Date",
    y = "Unemployment"
  ) +
  geom_line(color = "blue")
print(line_chart)