library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(summarytools)

# Loading the dataset

# Location of the CSV copy of the msleep data on the original author's machine.
msleep_path <- "C:/Users/ABHIRAM/Downloads/msleep.csv"

# The absolute path above only exists on that one machine, so fall back to the
# msleep dataset bundled with ggplot2 (loaded above) when the file is missing.
# NOTE(review): assumes the CSV is an export of ggplot2::msleep -- confirm.
msleep <- if (file.exists(msleep_path)) {
  read.csv(msleep_path)
} else {
  message("'", msleep_path, "' not found; using ggplot2::msleep instead.")
  # as.data.frame() keeps the result a plain data.frame, like read.csv() returns.
  as.data.frame(ggplot2::msleep)
}


# 1. Numeric Summary of Data
# Summarizing the five numeric columns selected below (this dataset has only 11 columns in total)

# Descriptive statistics for five numeric columns of msleep.
# select() keeps only the listed columns; summarytools::descr() then computes
# the per-column statistics (mean, quartiles, skewness, valid counts, ...).
numeric_summary <- msleep %>%
  select(sleep_total, sleep_rem, awake, brainwt, bodywt) %>%
  summarytools::descr()

# Print the summary table.
numeric_summary
## Descriptive Statistics  
## msleep  
## N: 83  
## 
##                      awake    bodywt   brainwt   sleep_rem   sleep_total
## ----------------- -------- --------- --------- ----------- -------------
##              Mean    13.57    166.14      0.28        1.88         10.43
##           Std.Dev     4.45    786.84      0.98        1.30          4.45
##               Min     4.10      0.00      0.00        0.10          1.90
##                Q1    10.20      0.15      0.00        0.90          7.70
##            Median    13.90      1.67      0.01        1.50         10.10
##                Q3    16.30     50.00      0.14        2.40         13.80
##               Max    22.10   6654.00      5.71        6.60         19.90
##               MAD     5.04      2.43      0.02        1.19          5.04
##               IQR     5.90     41.58      0.12        1.50          5.90
##                CV     0.33      4.74      3.47        0.69          0.43
##          Skewness    -0.05      7.10      4.63        1.46          0.05
##       SE.Skewness     0.26      0.26      0.32        0.31          0.26
##          Kurtosis    -0.71     53.72     20.96        2.73         -0.71
##           N.Valid    83.00     83.00     56.00       61.00         83.00
##         Pct.Valid   100.00    100.00     67.47       73.49        100.00
# 2. Categorical Summary
# For categorical columns (genus, vore, order, conservation), showing unique values and counts

# Build one frequency table per categorical column. The result is a list named
# after the columns; factor() drops missing values, so NAs are not counted.
categorical_summary <- lapply(
  select(msleep, genus, vore, order, conservation),
  function(column) {
    table(factor(column))
  }
)

# Print every frequency table.
categorical_summary
## $genus
## 
##      Acinonyx         Aotus    Aplodontia       Blarina           Bos 
##             1             1             1             1             1 
##      Bradypus   Callorhinus       Calomys         Canis     Capreolus 
##             1             1             1             1             1 
##         Capri         Cavis Cercopithecus    Chinchilla     Condylura 
##             1             1             1             1             1 
##    Cricetomys     Cryptotis       Dasypus   Dendrohyrax     Didelphis 
##             1             1             1             1             1 
##       Elephas     Eptesicus         Equus     Erinaceus  Erythrocebus 
##             1             1             2             1             1 
##      Eutamias         Felis        Galago       Genetta       Giraffa 
##             1             1             1             1             1 
## Globicephalus  Haliochoerus   Heterohyrax          Homo         Lemur 
##             1             1             1             1             1 
##     Loxodonta    Lutreolina        Macaca      Meriones  Mesocricetus 
##             1             1             1             1             1 
##      Microtus           Mus        Myotis      Neofiber     Nyctibeus 
##             1             1             1             1             1 
##       Octodon     Onychomys   Oryctolagus          Ovis           Pan 
##             1             1             1             1             1 
##      Panthera         Papio   Paraechinus  Perodicticus    Peromyscus 
##             3             1             1             1             1 
##     Phalanger         Phoca      Phocoena      Potorous    Priodontes 
##             1             1             1             1             1 
##      Procavia        Rattus     Rhabdomys       Saimiri      Scalopus 
##             1             1             1             1             1 
##      Sigmodon        Spalax  Spermophilus        Suncus           Sus 
##             1             1             3             1             1 
##  Tachyglossus        Tamias       Tapirus        Tenrec        Tupaia 
##             1             1             1             1             1 
##      Tursiops        Vulpes 
##             1             2 
## 
## $vore
## 
##   carni   herbi insecti    omni 
##      19      32       5      20 
## 
## $order
## 
##    Afrosoricida    Artiodactyla       Carnivora         Cetacea      Chiroptera 
##               1               6              12               3               2 
##       Cingulata Didelphimorphia   Diprotodontia  Erinaceomorpha      Hyracoidea 
##               2               2               2               2               3 
##      Lagomorpha     Monotremata  Perissodactyla          Pilosa        Primates 
##               1               1               3               1              12 
##     Proboscidea        Rodentia      Scandentia    Soricomorpha 
##               2              22               1               5 
## 
## $conservation
## 
##           cd domesticated           en           lc           nt           vu 
##            2           10            4           27            4            7
# 3. Combined Summary
# Combining the numeric and categorical summaries

# Bundle both summaries into one named list so they can be handled together.
combined_summary <- list(
  Numeric_Summary = numeric_summary,
  Categorical_Summary = categorical_summary
)

# Print the components one at a time. Printing the list as a whole only
# produced the summarytools message "x must either be a summarytools object
# created with freq(), descr(), or a list of summarytools objects created using
# by()" and showed neither summary.
# NOTE(review): presumably summarytools takes over printing of lists whose
# first element is a summarytools object -- confirm against its docs.
combined_summary$Numeric_Summary
combined_summary$Categorical_Summary
# 4. Novel Questions
# Question 1: What is the distribution of sleep_total across different orders?

# Mean and median of sleep_total within each order (missing values ignored);
# the result has one row per order.
q1 <- summarise(
  group_by(msleep, order),
  Avg_Sleep_Total = mean(sleep_total, na.rm = TRUE),
  Median_Sleep_Total = median(sleep_total, na.rm = TRUE)
)

# Print the per-order table.
q1
## # A tibble: 19 × 3
##    order           Avg_Sleep_Total Median_Sleep_Total
##    <chr>                     <dbl>              <dbl>
##  1 Afrosoricida              15.6                15.6
##  2 Artiodactyla               4.52                3.9
##  3 Carnivora                 10.1                10.2
##  4 Cetacea                    4.5                 5.2
##  5 Chiroptera                19.8                19.8
##  6 Cingulata                 17.8                17.8
##  7 Didelphimorphia           18.7                18.7
##  8 Diprotodontia             12.4                12.4
##  9 Erinaceomorpha            10.2                10.2
## 10 Hyracoidea                 5.67                5.4
## 11 Lagomorpha                 8.4                 8.4
## 12 Monotremata                8.6                 8.6
## 13 Perissodactyla             3.47                3.1
## 14 Pilosa                    14.4                14.4
## 15 Primates                  10.5                 9.9
## 16 Proboscidea                3.6                 3.6
## 17 Rodentia                  12.5                12.9
## 18 Scandentia                 8.9                 8.9
## 19 Soricomorpha              11.1                10.3
# Question 2: Is there a correlation between body weight and brain weight?

# Pearson correlation between body weight and brain weight, computed only on
# the rows where both weights are present.
q2 <- msleep %>%
  filter(!is.na(bodywt), !is.na(brainwt)) %>%
  summarise(Correlation = cor(bodywt, brainwt))

# Print the one-row result.
q2
##   Correlation
## 1   0.9337822
# Question 3: What is the most common vore type for each order?

# Most common vore per order: count every (order, vore) pair, sort each order's
# pairs by descending count, and keep the first row of each order.
# Ties are resolved by whichever row comes first after sorting.
q3 <- msleep %>%
  group_by(order, vore) %>%
  # .groups = "drop_last" spells out summarise()'s default (the result stays
  # grouped by order) and silences the "grouped output by 'order'" message.
  summarise(Count = n(), .groups = "drop_last") %>%
  arrange(order, desc(Count)) %>%
  # First row of each order group = the vore with the highest count.
  slice(1) %>%
  # Return a plain, ungrouped tibble so later verbs are not silently grouped.
  ungroup()
q3
## # A tibble: 19 × 3
##    order           vore    Count
##    <chr>           <chr>   <int>
##  1 Afrosoricida    omni        1
##  2 Artiodactyla    herbi       5
##  3 Carnivora       carni      12
##  4 Cetacea         carni       3
##  5 Chiroptera      insecti     2
##  6 Cingulata       carni       1
##  7 Didelphimorphia carni       1
##  8 Diprotodontia   herbi       1
##  9 Erinaceomorpha  omni        1
## 10 Hyracoidea      herbi       2
## 11 Lagomorpha      herbi       1
## 12 Monotremata     insecti     1
## 13 Perissodactyla  herbi       3
## 14 Pilosa          herbi       1
## 15 Primates        omni       10
## 16 Proboscidea     herbi       2
## 17 Rodentia        herbi      16
## 18 Scandentia      omni        1
## 19 Soricomorpha    omni        3
# Question 4: How does conservation status vary among different genera?

# Most frequent conservation status per genus: count every
# (genus, conservation) pair, sort each genus's pairs by descending count, and
# keep the first row of each genus. Missing statuses are counted as their own
# category (NA), and ties are resolved by whichever row comes first.
q4 <- msleep %>%
  group_by(genus, conservation) %>%
  # .groups = "drop_last" spells out summarise()'s default (the result stays
  # grouped by genus) and silences the "grouped output by 'genus'" message.
  summarise(Count = n(), .groups = "drop_last") %>%
  arrange(genus, desc(Count)) %>%
  # First row of each genus group = the status with the highest count.
  slice(1) %>%
  # Return a plain, ungrouped tibble so later verbs are not silently grouped.
  ungroup()
q4
## # A tibble: 77 × 3
##    genus       conservation Count
##    <chr>       <chr>        <int>
##  1 Acinonyx    lc               1
##  2 Aotus       <NA>             1
##  3 Aplodontia  nt               1
##  4 Blarina     lc               1
##  5 Bos         domesticated     1
##  6 Bradypus    <NA>             1
##  7 Callorhinus vu               1
##  8 Calomys     <NA>             1
##  9 Canis       domesticated     1
## 10 Capreolus   lc               1
## # ℹ 67 more rows
# Question 5: Is there a relationship between sleep patterns (e.g., sleep_total) and conservation status?

# Mean and median of sleep_total for each conservation status (missing sleep
# values ignored); rows with a missing status form their own NA group.
q5 <- summarise(
  group_by(msleep, conservation),
  Avg_Sleep_Total = mean(sleep_total, na.rm = TRUE),
  Median_Sleep_Total = median(sleep_total, na.rm = TRUE)
)

# Print the per-status table.
q5
## # A tibble: 7 × 3
##   conservation Avg_Sleep_Total Median_Sleep_Total
##   <chr>                  <dbl>              <dbl>
## 1 cd                      2.3                2.3 
## 2 domesticated            7.58               8.75
## 3 en                     13.0               15.0 
## 4 lc                     11.4               10.9 
## 5 nt                     13.0               13.4 
## 6 vu                      6.93               5.6 
## 7 <NA>                   11.2               10.6
# 6. Visual Summary
# Visualization 1: Distribution of sleep_total

# Histogram of sleep_total using 20 bins, drawn as sky-blue bars with black
# outlines.
ggplot(data = msleep, mapping = aes(x = sleep_total)) +
  geom_histogram(bins = 20, colour = "black", fill = "skyblue") +
  labs(x = "Sleep Total Hours", title = "Distribution of Sleep Total")

# Visualization 2: Scatterplot of bodywt vs. brainwt

# Scatterplot of brain weight against body weight, with points coloured by vore.
# NOTE(review): both weights are strongly right-skewed (see the skewness row of
# the numeric summary), so log10 axes may be easier to read -- consider adding
# scale_x_log10() / scale_y_log10().
ggplot(msleep, aes(x = bodywt, y = brainwt)) +
  # na.rm = TRUE drops rows with a missing weight explicitly instead of
  # emitting the "Removed 27 rows containing missing values" warning.
  geom_point(aes(color = vore), alpha = 0.7, na.rm = TRUE) +
  labs(title = "Scatterplot of Body Weight vs. Brain Weight",
       x = "Body Weight (kg)", y = "Brain Weight (kg)")

# Visualization 3: Boxplot of sleep_total by order

# One light-green boxplot of sleep_total per order; coord_flip() turns the
# plot so the order names run along the vertical axis.
ggplot(data = msleep) +
  geom_boxplot(
    mapping = aes(x = order, y = sleep_total),
    colour = "black",
    fill = "lightgreen"
  ) +
  labs(y = "Sleep Total Hours", x = "Order", title = "Sleep Total by Order") +
  coord_flip()

# Visualization 4: Bar chart of conservation status counts

# Bar chart with one salmon bar per conservation status; bar height is the
# number of rows with that status.
ggplot(data = msleep) +
  geom_bar(mapping = aes(x = conservation), fill = "salmon") +
  labs(
    y = "Count",
    x = "Conservation Status",
    title = "Conservation Status Counts"
  )

# Visualization 5: Bar chart of vore counts

# Bar chart with one light-blue bar per vore type; bar height is the number of
# rows with that vore.
ggplot(data = msleep) +
  geom_bar(mapping = aes(x = vore), fill = "lightblue") +
  labs(
    y = "Count",
    x = "Vore Type",
    title = "Vore Type Counts"
  )