# Open the help page of the built-in datasets package.
help("datasets-package")
## starting httpd help server ... done
# List everything the datasets package provides.
library(help = "datasets")
# Load the iris data set and preview its first six rows.
data(iris)
head(iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Build a small data frame of scores by year.
years <- c(1980, 1980, 1985, 1990)
scores <- c(34, 44, 56, 83)

df <- data.frame(years, scores)

df
##   years scores
## 1  1980     34
## 2  1980     44
## 3  1985     56
## 4  1990     83
# Columns can be extracted by position ...
df[, 1]
## [1] 1980 1980 1985 1990
df[, 2]
## [1] 34 44 56 83
# ... or by name.
df$years
## [1] 1980 1980 1985 1990
df$scores
## [1] 34 44 56 83
# Rows whose score is below 50 (row condition before the comma, all columns).
df[df$scores < 50, ]
##   years scores
## 1  1980     34
## 2  1980     44
# Scores recorded in 1980. The column name is spelled out in full: `df$year`
# only worked through `$` partial matching, which returns NULL as soon as a
# second column starting with "year" exists.
df[df$years == 1980, "scores"]
## [1] 34 44
# Subsetting
# The comma matters: without it a logical vector selects COLUMNS, so the
# row condition is silently ignored and every row is returned.
low_scores <- df[df$scores < 50, ]
low_scores
##   years scores
## 1  1980     34
## 2  1980     44
# Adding new Attribute
# The single placeholder value is recycled to every row.
df$Age <- ".."
df
##   years scores Age
## 1  1980     34  ..
## 2  1980     44  ..
## 3  1985     56  ..
## 4  1990     83  ..
# Creating a data frame from three parallel vectors
# (one entry per patient in each vector).
subject_name <- c("John doe", "james doe", "Steve Graves")
Temperature <- c(98.1, 98.6, 101.4)
flu_status <- rep(FALSE, 3)

df <- data.frame(
  subject_name = subject_name,
  Temperature = Temperature,
  flu_status = flu_status
)
print(df)
##   subject_name Temperature flu_status
## 1     John doe        98.1      FALSE
## 2    james doe        98.6      FALSE
## 3 Steve Graves       101.4      FALSE
# Toy medical test: three patients with a temperature reading and a flu flag.
subject_name <- c("John Doe", "James Doe", "Steve Graves")
Temperature <- c(98.1, 98.6, 101.4)
flu_status <- c(FALSE, FALSE, TRUE)
df <- data.frame(
  subject_name = subject_name,
  Temperature = Temperature,
  flu_status = flu_status
)
print(df)
##   subject_name Temperature flu_status
## 1     John Doe        98.1      FALSE
## 2    James Doe        98.6      FALSE
## 3 Steve Graves       101.4       TRUE
# Blood type as a factor whose levels are listed explicitly.
blood <- factor(
  c("O", "AB", "A"),
  levels = c("A", "AB", "O")
)
blood
## [1] O  AB A 
## Levels: A AB O
# Accessing patients by their position in the vector:
# a single position,
Temperature[2]
## [1] 98.6
# a range of positions,
Temperature[2:3]
## [1]  98.6 101.4
# and everything except one position (negative index).
Temperature[-2]
## [1]  98.1 101.4
Temperature[-3]
## [1] 98.1 98.6
# A factor is a kind of vector that stores a categorical or ordinal variable.
# It is created with factor().

# Creating a factor from a character vector
gender <- factor(c("male", "Female", "male"))
gender
## [1] male   Female male  
## Levels: Female male
# Creating an ordinal factor
# An ordinal factor is a factor whose levels have a natural ordering or hierarchy.
# The desired order has to be passed as the `levels` ARGUMENT of the call itself.
# A separate `levels = c(...)` statement only creates an unrelated variable
# called `levels` and leaves the factor's levels in alphabetical order.
# The values must also match the level labels exactly (case-sensitive),
# otherwise they become NA.
education <- factor(
  c("highschool", "college", "graduate"),
  levels = c("highschool", "college", "graduate")
)
education
## [1] highschool college    graduate  
## Levels: highschool college graduate
# ordered() (or factor(..., ordered = TRUE)) additionally makes the levels comparable.
education <- ordered(
  c("highschool", "college", "graduate"),
  levels = c("highschool", "college", "graduate")
)
education
## [1] highschool college    graduate  
## Levels: highschool < college < graduate
symptoms <- factor(
  c("severe", "mild", "moderate"),
  levels = c("mild", "moderate", "severe"),
  ordered = TRUE
)
symptoms
## [1] severe   mild     moderate
## Levels: mild < moderate < severe
# To test whether each patient's symptoms are more severe than moderate:
symptoms > "moderate"
## [1]  TRUE FALSE FALSE
# Each attribute of the first patient can be displayed one after the other
# by indexing every vector separately, like this:
print(subject_name[1])
## [1] "John Doe"
print(Temperature[1])
## [1] 98.1
print(flu_status[1])
## [1] FALSE
print(gender[1])
## [1] male
## Levels: Female male
print(blood[1])
## [1] O
## Levels: A AB O
print(symptoms[1])
## [1] severe
## Levels: mild < moderate < severe
# Creating a data frame that gathers all the patient information.
# stringsAsFactors = FALSE keeps subject_name as plain character data;
# gender, blood and symptoms are already factors and stay factors.
pt_data <- data.frame(
  subject_name, Temperature, flu_status, gender, blood, symptoms,
  stringsAsFactors = FALSE
)
pt_data
##   subject_name Temperature flu_status gender blood symptoms
## 1     John Doe        98.1      FALSE   male     O   severe
## 2    James Doe        98.6      FALSE Female    AB     mild
## 3 Steve Graves       101.4       TRUE   male     A moderate
# Accessing data in the data frame
pt_data$subject_name
## [1] "John Doe"     "James Doe"    "Steve Graves"
# To extract the value in the first row and second column of the patient data frame:
pt_data[1, 2]
## [1] 98.1
# To refer to every row or every column, simply leave the row or column portion blank.
# To extract all rows of the first column:
pt_data[, 1]
## [1] "John Doe"     "James Doe"    "Steve Graves"
# To pull the first and third rows of the second and fourth columns:
pt_data[c(1, 3), c(2, 4)]
##   Temperature gender
## 1        98.1   male
## 3       101.4   male
# To extract everything:
pt_data[, ]
##   subject_name Temperature flu_status gender blood symptoms
## 1     John Doe        98.1      FALSE   male     O   severe
## 2    James Doe        98.6      FALSE Female    AB     mild
## 3 Steve Graves       101.4       TRUE   male     A moderate
# Note that columns are better accessed by name rather than by position:
pt_data[c(1, 3), c("Temperature", "gender")]
##   Temperature gender
## 1        98.1   male
## 3       101.4   male
# Creating new columns in a data frame:
# convert the Fahrenheit readings to Celsius.
pt_data$temp_c <- (pt_data$Temperature - 32) * (5 / 9)
pt_data
##   subject_name Temperature flu_status gender blood symptoms   temp_c
## 1     John Doe        98.1      FALSE   male     O   severe 36.72222
## 2    James Doe        98.6      FALSE Female    AB     mild 37.00000
## 3 Steve Graves       101.4       TRUE   male     A moderate 38.55556
# To confirm the calculation worked, compare the new Celsius-based temp_c
# column with the original Fahrenheit-scale Temperature column:
pt_data[c("Temperature", "temp_c")]
##   Temperature   temp_c
## 1        98.1 36.72222
## 2        98.6 37.00000
## 3       101.4 38.55556
# We need to load the required libraries.
# Install tidyverse only when it is missing, and do it BEFORE attaching it:
# install.packages() refuses to install a package that is already in use
# and only emits a warning.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# ggplot2 and dplyr are listed above as already attached by tidyverse;
# the explicit calls are kept to make the dependencies of the plots obvious.
library(ggplot2)
library(dplyr)
# An empty ggplot() call confirms that ggplot2 is available.
ggplot()

# Plot the mpg data with displ on the x axis and hwy (highway miles) on the y axis.
# If this raises an error, load the ggplot2 library first and then re-run the code below.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

# Same availability check as above.
ggplot()

# Same displ versus hwy scatter plot as above.
# If this raises an error, load the ggplot2 library first and then re-run the code below.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

# Scatter plot of displacement against city mileage
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = cty))

# Scatter plot of displacement against number of cylinders
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = cyl))

# Smoothed trend of number of cylinders against displacement.
# geom_smooth() is used to add a smooth line or curve to a plot.
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = cyl))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Displacement against hwy, with the point colour mapped to class
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = class))

# Displacement against hwy, with every point drawn in green.
# The colour is a fixed value here, so it is set outside aes().
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), color = "green")

# Displacement against hwy, with the point shape mapped to class
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 62 rows containing missing values (`geom_point()`).

# Displacement against hwy, with the point size mapped to class.
# This means that points belonging to different classes will have different sizes.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.

# Displacement against hwy, with the point transparency mapped to class.
# Different classes will have different transparency levels.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.

# Using facets and subplots ----
# Faceting is the process of breaking data into subsets and displaying those
# subsets in separate panels (also known as small multiples) within the same plot.
# Faceting is typically done on one or more categorical variables.
# Each panel shows a different subset of the data, which makes it easier to
# compare groups or categories.

# facet_wrap(~ class, nrow = 2) splits the data into panels based on the
# "class" variable, laid out in two rows. Each panel is one vehicle class.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~ class, nrow = 2)

# facet_wrap(~ class, nrow = 3) does the same with three rows of panels.
# Note: class is a variable in the mpg dataset that categorizes vehicles
# into classes such as "subcompact", "compact", "midsize", etc.

# facet_wrap(~ class) and facet_grid(class ~ .) both tell ggplot2 to create
# a separate panel for each level of the "class" variable.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_wrap(~ class, nrow = 3)

# facet_grid() with class on the right-hand side of the formula puts one
# panel per class side by side in a single row.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(. ~ class)

# facet_grid(drv ~ cyl) facets the plot on the combination of the drv
# (drive train) and cyl (number of cylinders) variables.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  facet_grid(drv ~ cyl)

# geom_smooth ----
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = cyl))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# The drv variable is mapped to the linetype aesthetic, so each drive train
# type (drv) is drawn with a different line type.
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# With group = drv, ggplot2 fits a separate smoothed line for each unique
# value of drv. Each line shows the trend between engine displacement (displ)
# and highway miles per gallon (hwy) for one type of drive train.
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# The + operator adds additional layers to the plot.
# The second geom_smooth(mapping = aes(x = displ, y = hwy)) adds another
# smoothed line, again for the relationship between engine displacement
# (displ) and highway miles per gallon (hwy).
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# (1) ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) initializes the
#     plot with mpg as the data source and sets the x (displ) and y (hwy)
#     mappings for every layer.
# (2) geom_point(mapping = aes(color = class)) adds points whose colour is
#     mapped to the "class" variable, so each point is coloured by the
#     vehicle class it belongs to.
# (3) geom_smooth() adds a smoothed line. No mapping of its own is given, so
#     it uses the plot-level mapping and the default smoothing method.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# dplyr::filter(mpg, class == "subcompact") keeps only the rows of mpg whose
# "class" is "subcompact". Only that subset is used to fit the smoothed line;
# se = FALSE hides the confidence band.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class)) +
  geom_smooth(
    data = dplyr::filter(mpg, class == "subcompact"),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# dplyr::filter(mpg, class == "minivan") keeps only the rows of mpg whose
# "class" is "minivan". Only that subset is used to fit the smoothed line.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class)) +
  geom_smooth(
    data = dplyr::filter(mpg, class == "minivan"),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 4.008
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.708
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.25

# Two quick ways to get to know the diamonds dataset:
#   ?diamonds      opens its help page (description, variables, examples);
#   head(diamonds) shows its first few rows.

# head() displays the first 6 rows of the dataset in the R console, allowing
# you to quickly inspect its structure and contents.
head(diamonds, n = 6L)
## # A tibble: 6 × 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1  0.23 Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2  0.21 Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3  0.23 Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4  0.29 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5  0.31 Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
# Open the help page, which gives detailed information about the dataset,
# including its description, variables, and usage examples.
help("diamonds")

# str(diamonds) shows the structure (type and first values) of every column.
str(diamonds)
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# summary() gives statistical summaries for each numerical variable
# (carat, depth, table, price, x, y, z): minimum, 1st quartile, median, mean,
# 3rd quartile and maximum. For the factor columns it shows level counts.
summary(diamonds)
##      carat               cut        color        clarity          depth      
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065   Min.   :43.00  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258   1st Qu.:61.00  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194   Median :61.80  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171   Mean   :61.75  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066   3rd Qu.:62.50  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655   Max.   :79.00  
##                                     J: 2808   (Other): 2531                  
##      table           price             x                y         
##  Min.   :43.00   Min.   :  326   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710   1st Qu.: 4.720  
##  Median :57.00   Median : 2401   Median : 5.700   Median : 5.710  
##  Mean   :57.46   Mean   : 3933   Mean   : 5.731   Mean   : 5.735  
##  3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540   3rd Qu.: 6.540  
##  Max.   :95.00   Max.   :18823   Max.   :10.740   Max.   :58.900  
##                                                                   
##        z         
##  Min.   : 0.000  
##  1st Qu.: 2.910  
##  Median : 3.530  
##  Mean   : 3.539  
##  3rd Qu.: 4.040  
##  Max.   :31.800  
## 
# Summary of the cut variable only. cut is a categorical variable giving the
# quality of the cut (Fair, Good, Very Good, Premium, Ideal), so summary()
# displays the frequency of each level.
summary(diamonds[["cut"]])
##      Fair      Good Very Good   Premium     Ideal 
##      1610      4906     12082     13791     21551
# Summary of the carat variable only. carat is numeric, so summary() displays
# the minimum, quartiles, median, mean and maximum instead of level counts.
summary(diamonds[["carat"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2000  0.4000  0.7000  0.7979  1.0400  5.0100
# To view the data in table form
view(diamonds)

# mean() returns the arithmetic mean (average) of a numeric vector.
val <- c(46, 34, 87, 22, 91)
mean(x = val)
## [1] 56
# Mean of the price variable in the diamonds dataset, i.e. the average price
# of the diamonds it contains.
mean(diamonds[["price"]])
## [1] 3932.8
# summary() on the same column returns the minimum, 1st quartile, median,
# mean, 3rd quartile and maximum. No missing-value count appears in the
# output below.
summary(diamonds[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18823