# Open the overview help page for the built-in datasets package.
help("datasets-package")
## starting httpd help server ... done
# List the contents of the datasets package.
library(help = "datasets")
# Load the iris data set and preview its first six rows.
data(iris)
head(iris, n = 6)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Build a small data frame of scores recorded in given years.
years <- c(1980, 1980, 1985, 1990)
scores <- c(34, 44, 56, 83)
df <- data.frame(years, scores)
df
## years scores
## 1 1980 34
## 2 1980 44
## 3 1985 56
## 4 1990 83
# Column access by position: leave the row slot empty to keep every row.
df[, 1]
## [1] 1980 1980 1985 1990
df[, 2]
## [1] 34 44 56 83
# Column access by name.
df$years
## [1] 1980 1980 1985 1990
df$scores
## [1] 34 44 56 83
# Row filtering: the logical mask goes BEFORE the comma.
df[df$scores < 50, ]
## years scores
## 1 1980 34
## 2 1980 44
# Spell the column name out in full (`years`, not `year`). `$` on a data frame
# partial-matches names, which becomes ambiguous as soon as another column
# starting with "year" exists.
df[df$years == 1980, "scores"]
## [1] 34 44
# Subsetting
# The trailing comma is required: without it the logical vector is used to
# select columns rather than rows, and the data frame comes back unfiltered.
df[df$scores < 50, ]
## years scores
## 1 1980 34
## 2 1980 44
# Adding a new attribute (column); the single placeholder string is recycled
# to every row.
df$Age <- ".."
df
## years scores Age
## 1 1980 34 ..
## 2 1980 44 ..
## 3 1985 56 ..
## 4 1990 83 ..
# Creating a data frame from three parallel vectors (first attempt).
subject_name <- c("John doe", "james doe", "Steve Graves")
Temperature <- c(98.1, 98.6, 101.4)
flu_status <- rep(FALSE, 3)
df <- data.frame(
  subject_name = subject_name,
  Temperature = Temperature,
  flu_status = flu_status
)
print(df)
## subject_name Temperature flu_status
## 1 John doe 98.1 FALSE
## 2 james doe 98.6 FALSE
## 3 Steve Graves 101.4 FALSE
# Toy medical test: the same three patients with consistently capitalised
# names and the third patient flagged as having flu.
subject_name <- c("John Doe", "James Doe", "Steve Graves")
Temperature <- c(98.1, 98.6, 101.4)
flu_status <- c(FALSE, FALSE, TRUE)
df <- data.frame(
  subject_name = subject_name,
  Temperature = Temperature,
  flu_status = flu_status
)
print(df)
## subject_name Temperature flu_status
## 1 John Doe 98.1 FALSE
## 2 James Doe 98.6 FALSE
## 3 Steve Graves 101.4 TRUE
# Blood type as a factor with an explicitly declared set of levels.
blood <- factor(
  c("O", "AB", "A"),
  levels = c("A", "AB", "O")
)
blood
## [1] O AB A
## Levels: A AB O
# Accessing patients by their position in a vector.
# A single position:
Temperature[2]
## [1] 98.6
# Several positions at once:
Temperature[c(2, 3)]
## [1] 98.6 101.4
# A negative index drops that position and keeps the rest:
Temperature[-2]
## [1] 98.1 101.4
Temperature[-3]
## [1] 98.1 98.6
# A factor is a kind of vector that stores categorical or ordinal data.
# It is created with factor().
# Creating a factor from a character vector: when no levels are given they
# default to the sorted unique values.
gender <- factor(c("male", "Female", "male"))
gender
## [1] male Female male
## Levels: Female male
# Creating an ordinal factor.
# An ordinal factor is a factor whose levels have a natural ordering or
# hierarchy. The desired order must be passed as the `levels` ARGUMENT of
# factor()/ordered(); a separate `levels = ...` statement only creates an
# unrelated variable and leaves the factor with alphabetical levels.
# The values must also match the level labels exactly (case-sensitive),
# otherwise they become NA.
education <- factor(
  c("highschool", "college", "graduate"),
  levels = c("highschool", "college", "graduate")
)
education
## [1] highschool college graduate
## Levels: highschool college graduate
education <- ordered(
  c("highschool", "college", "graduate"),
  levels = c("highschool", "college", "graduate")
)
education
## [1] highschool college graduate
## Levels: highschool < college < graduate
symptoms <- factor(
  c("severe", "mild", "moderate"),
  levels = c("mild", "moderate", "severe"),
  ordered = TRUE
)
symptoms
## [1] severe mild moderate
## Levels: mild < moderate < severe
# Test whether each patient's symptoms are more severe than "moderate";
# comparison operators respect the level order of an ordered factor.
symptoms > "moderate"
## [1] TRUE FALSE FALSE
# Looking up one patient means indexing every vector separately, one after
# the other, like this:
subject_name[1L]
## [1] "John Doe"
Temperature[1L]
## [1] 98.1
flu_status[1L]
## [1] FALSE
gender[1L]
## [1] male
## Levels: Female male
blood[1L]
## [1] O
## Levels: A AB O
symptoms[1L]
## [1] severe
## Levels: mild < moderate < severe
# Creating a data frame that holds all of the patient information together.
pt_data <- data.frame(
  subject_name, Temperature, flu_status, gender, blood, symptoms,
  stringsAsFactors = FALSE
)
pt_data
## subject_name Temperature flu_status gender blood symptoms
## 1 John Doe 98.1 FALSE male O severe
## 2 James Doe 98.6 FALSE Female AB mild
## 3 Steve Graves 101.4 TRUE male A moderate
# Accessing data in the data frame: `$` extracts one column as a vector.
pt_data$subject_name
## [1] "John Doe" "James Doe" "Steve Graves"
# Extract the value in the first row and second column.
pt_data[1, 2]
## [1] 98.1
# To refer to every row or every column, leave that slot blank.
# Extract all rows of the first column:
pt_data[, 1]
## [1] "John Doe" "James Doe" "Steve Graves"
# Pull the first and third rows of the second and fourth columns.
pt_data[c(1, 3), c(2, 4)]
## Temperature gender
## 1 98.1 male
## 3 101.4 male
# Leaving both slots blank extracts everything.
pt_data[, ]
## subject_name Temperature flu_status gender blood symptoms
## 1 John Doe 98.1 FALSE male O severe
## 2 James Doe 98.6 FALSE Female AB mild
## 3 Steve Graves 101.4 TRUE male A moderate
# Columns are better accessed by name than by position, so the code keeps
# working if the column order changes.
pt_data[c(1, 3), c("Temperature", "gender")]
## Temperature gender
## 1 98.1 male
## 3 101.4 male
# Creating a new column: convert the Fahrenheit readings to Celsius.
pt_data$temp_c <- (pt_data$Temperature - 32) * (5 / 9)
pt_data
## subject_name Temperature flu_status gender blood symptoms temp_c
## 1 John Doe 98.1 FALSE male O severe 36.72222
## 2 James Doe 98.6 FALSE Female AB mild 37.00000
## 3 Steve Graves 101.4 TRUE male A moderate 38.55556
# Confirm the calculation worked by comparing the new Celsius-based temp_c
# column with the original Fahrenheit-scale Temperature column.
pt_data[c("Temperature", "temp_c")]
## Temperature temp_c
## 1 98.1 36.72222
## 2 98.6 37.00000
## 3 101.4 38.55556
# Load the required libraries. library(tidyverse) attaches the core packages
# listed in its start-up message below, which already include ggplot2 and
# dplyr; the two explicit calls that follow are harmless repeats.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
# One-time setup: if tidyverse is not installed yet, install it from the
# console BEFORE running the library() calls above. Do not install from inside
# the script; calling install.packages() once the package is attached only
# produces a "package is in use" warning.
#   install.packages("tidyverse")
# Confirm that ggplot2 is available: a bare ggplot() call draws an empty plot.
ggplot()

# Plot the mpg data with displ on the x axis and hwy (highway miles per
# gallon) on the y axis.
# If this errors because ggplot() cannot be found, run library(ggplot2) first
# and then re-run the line below.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))

# Scatterplot of displacement against city mileage.
ggplot(mpg) +
  geom_point(aes(displ, cty))

# Scatterplot of displacement against number of cylinders.
ggplot(mpg) +
  geom_point(aes(displ, cyl))

# Same two variables drawn with geom_smooth(), which adds a smooth line or
# curve to a plot.
ggplot(mpg) +
  geom_smooth(aes(displ, cyl))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Displacement against hwy, with point colour mapped to the class variable.
ggplot(mpg) +
  geom_point(aes(displ, hwy, color = class))

# Displacement against hwy with every point drawn green: color is set outside
# aes(), so it is a fixed value rather than a mapping to a variable.
ggplot(mpg) +
  geom_point(aes(displ, hwy), color = "green")

# Displacement against hwy, with point shape mapped to class.
ggplot(mpg) +
  geom_point(aes(displ, hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
## that many have them.
## Warning: Removed 62 rows containing missing values (`geom_point()`).

# Displacement against hwy, with point size mapped to class, so points
# belonging to different classes are drawn at different sizes.
ggplot(mpg) +
  geom_point(aes(displ, hwy, size = class))
## Warning: Using size for a discrete variable is not advised.

# Displacement against hwy, with transparency (alpha) mapped to class, so
# different classes get different transparency levels.
ggplot(mpg) +
  geom_point(aes(displ, hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.
### Using facets and subplots
# Faceting breaks the data into subsets and displays each subset in its own
# panel (also known as small multiples) within the same plot. It is typically
# driven by one or more categorical variables, which makes it easier to
# compare groups or categories.
# facet_wrap(~class, nrow = 2) creates one panel per value of class and
# arranges the panels in two rows.
ggplot(mpg) +
  geom_point(aes(displ, hwy)) +
  facet_wrap(~class, nrow = 2)

# The same panels arranged in three rows.
# class is a variable in the mpg dataset that categorises vehicles into
# classes such as "subcompact", "compact", "midsize", etc.; faceting on it
# tells ggplot2 to draw a separate panel for each of its levels.
ggplot(mpg) +
  geom_point(aes(displ, hwy)) +
  facet_wrap(~class, nrow = 3)

# facet_grid() with class on the column side only: one column of the grid per
# level of class, all in a single row.
ggplot(mpg) +
  geom_point(aes(displ, hwy)) +
  facet_grid(. ~ class)

# facet_grid(drv ~ cyl) lays the panels out as a grid: rows follow drv (drive
# train) and columns follow cyl (number of cylinders).
ggplot(mpg) +
  geom_point(aes(displ, hwy)) +
  facet_grid(drv ~ cyl)

#### geom_smooth ################
# A smoothed trend of number of cylinders against displacement.
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = cyl))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# drv is mapped to the linetype aesthetic, so each drive train type is drawn
# with a different line type.
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# With group = drv, ggplot2 fits a separate smoothed line for each unique
# value of drv. Each line shows the trend between engine displacement (displ)
# and highway miles per gallon (hwy) for one type of drive train.
ggplot(data = mpg) +
  geom_smooth(mapping = aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# The + operator adds further layers to a plot. Here the raw points are drawn
# first and a smoothed line over the same displ/hwy mapping is added on top.
# (Adding the identical geom_smooth() layer twice would only redraw the same
# curve over itself, so the first layer is geom_point().)
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy)) +
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# (1) ggplot(mpg, aes(displ, hwy)) initialises the plot with mpg as the data
#     source and sets the x (displ) and y (hwy) mappings for the whole plot.
# (2) geom_point(aes(color = class)) adds points whose colour is mapped to the
#     class variable, so each point is coloured by its vehicle class.
# (3) geom_smooth() adds a smoothed line. It is given no mappings of its own,
#     so it uses the plot-level ones and the default smoothing settings.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# filter(mpg, class == "subcompact") keeps only the rows whose class equals
# "subcompact". That subset is passed as the data for the smooth layer only,
# and se = FALSE hides the confidence band.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# The same plot with the smooth layer fitted to the "minivan" rows only.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth(data = filter(mpg, class == "minivan"), se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 4.008
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.708
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.25

# Two quick ways to get to know the diamonds dataset:
#   * its help page describes the dataset and its variables;
#   * head() prints the first rows (six by default) so the structure and
#     contents can be inspected quickly in the console.
head(diamonds, n = 6)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Open the help page for the dataset.
help("diamonds")
# str() shows the structure (type and first values) of every column.
str(diamonds)
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# summary() on the whole dataset: for each numeric column (carat, depth,
# table, price, x, y, z) it reports the minimum, 1st quartile, median, mean,
# 3rd quartile and maximum; for the factor columns it reports level counts.
summary(diamonds)
## carat cut color clarity depth
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065 Min. :43.00
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258 1st Qu.:61.00
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194 Median :61.80
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171 Mean :61.75
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066 3rd Qu.:62.50
## Max. :5.0100 I: 5422 VVS1 : 3655 Max. :79.00
## J: 2808 (Other): 2531
## table price x y
## Min. :43.00 Min. : 326 Min. : 0.000 Min. : 0.000
## 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710 1st Qu.: 4.720
## Median :57.00 Median : 2401 Median : 5.700 Median : 5.710
## Mean :57.46 Mean : 3933 Mean : 5.731 Mean : 5.735
## 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540 3rd Qu.: 6.540
## Max. :95.00 Max. :18823 Max. :10.740 Max. :58.900
##
## z
## Min. : 0.000
## 1st Qu.: 2.910
## Median : 3.530
## Mean : 3.539
## 3rd Qu.: 4.040
## Max. :31.800
##
# Summary of the cut column alone. cut is a categorical variable describing
# the quality of the cut (Fair, Good, Very Good, Premium, Ideal), so summary()
# reports how many diamonds fall into each level.
summary(diamonds[["cut"]])
## Fair Good Very Good Premium Ideal
## 1610 4906 12082 13791 21551
# Summary of the carat column alone. carat is numeric, so summary() reports
# the minimum, quartiles, median, mean and maximum instead of counts.
summary(diamonds[["carat"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2000 0.4000 0.7000 0.7979 1.0400 5.0100
# View the dataset in table form.
view(diamonds)
# mean() calculates the arithmetic mean (average) of a numeric vector.
val <- c(46, 34, 87, 22, 91)
mean(val)
## [1] 56
# The mean of the price column, i.e. the average price of the diamonds in the
# dataset.
mean(diamonds[["price"]])
## [1] 3932.8
# Summary statistics for price: minimum, 1st quartile, median, mean,
# 3rd quartile and maximum. (A count of missing values is added only when the
# column contains NAs; none are reported here.)
summary(diamonds[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18823