project02

Author

rachael berghahn

## load the Forbes2000.csv dataset; text columns are read in as factors
forbes <- read.csv(
  file = "Forbes2000.csv",
  stringsAsFactors = TRUE
)

library(dplyr)

Warning: package 'dplyr' was built under R version 4.3.3


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

## per-category summary: mean profit (missing values ignored) and number of
## rows, sorted from the lowest to the highest average profit so the extremes
## sit at the two ends of the table
profit_by_category <- forbes %>%
  group_by(category) %>%
  summarize(avg_profit = mean(profits, na.rm = TRUE), count = n()) %>%
  arrange(avg_profit)

## first five rows of the ascending table = five lowest average profits;
## telecommunications services has the lowest average profit
slice_head(profit_by_category, n = 5)

# A tibble: 5 × 3
  category                     avg_profit count
  <fct>                             <dbl> <int>
1 Telecommunications services     -0.908     67
2 Trading companies                0.028     25
3 Capital goods                    0.0955    53
4 Transportation                   0.139     80
5 Business services & supplies     0.171     70

## last five rows of the ascending table = five highest average profits;
## drugs & biotechnology has the highest average profit
slice_tail(profit_by_category, n = 5)

# A tibble: 5 × 3
  category              avg_profit count
  <fct>                      <dbl> <int>
1 Software & services        0.568    31
2 Food drink & tobacco       0.594    83
3 Conglomerates              1.01     31
4 Oil & gas operations       1.31     90
5 Drugs & biotechnology      1.45     45

## total and average profit per country (missing profits ignored),
## listed from the largest total profit down;
## the united states generates the highest total profit
forbes %>%
  group_by(country) %>%
  summarize(
    total_profit = sum(profits, na.rm = TRUE),
    avg_profit = mean(profits, na.rm = TRUE)
  ) %>%
  arrange(desc(total_profit))

# A tibble: 61 × 3
   country                     total_profit avg_profit
   <fct>                              <dbl>      <dbl>
 1 United States                      487.       0.652
 2 Canada                              23.3      0.416
 3 United Kingdom                      21.7      0.160
 4 Australia                           18.1      0.502
 5 South Korea                         15.6      0.347
 6 China                               15.5      0.622
 7 Russia                              14.9      1.24 
 8 Switzerland                         13.8      0.404
 9 Spain                               11.7      0.405
10 Netherlands/ United Kingdom         10.6      5.32 
# ℹ 51 more rows

## total and average sales per country (missing sales ignored),
## listed from the largest total sales down;
## the united states generates the highest total sales
forbes %>%
  group_by(country) %>%
  summarize(
    total_sales = sum(sales, na.rm = TRUE),
    avg_sales = mean(sales, na.rm = TRUE)
  ) %>%
  arrange(desc(total_sales))

# A tibble: 61 × 3
   country        total_sales avg_sales
   <fct>                <dbl>     <dbl>
 1 United States        7554.     10.1 
 2 Japan                3220.     10.2 
 3 United Kingdom       1431.     10.4 
 4 Germany              1351.     20.8 
 5 France               1266.     20.1 
 6 Netherlands           477.     17.0 
 7 Switzerland           424.     12.5 
 8 Italy                 419.     10.2 
 9 Canada                360.      6.43
10 South Korea           359.      7.97
# ℹ 51 more rows

## keep only united states and japan rows, then take the best
## (numerically smallest) rank found within each country
forbes %>%
  filter(country %in% c("United States", "Japan")) %>%
  group_by(country) %>%
  summarize(best_rank = min(rank, na.rm = TRUE))

# A tibble: 2 × 2
  country       best_rank
  <fct>             <int>
1 Japan                 8
2 United States         1

##the best-ranked usa company is ranked 1, while the best-ranked japanese company is ranked 8, so the usa has the higher-ranked top company

library(dplyr)
library(tidyr)

Warning: package 'tidyr' was built under R version 4.3.3

##using filter and count to directly compare both countries, using pivot to create a table directly comparing results from both countries

## five most common categories among united states rows
## (slice_max keeps ties at the cutoff by default);
## banking and diversified financials are the top 2 types
forbes %>%
  filter(country == "United States") %>%
  count(category, sort = TRUE) %>%
  slice_max(order_by = n, n = 5)

                          category  n
1                          Banking 83
2           Diversified financials 60
3                        Utilities 54
4 Health care equipment & services 53
5                        Retailing 53

## five most common categories among japan rows
## (slice_max keeps ties at the cutoff by default);
## banking and diversified financials are also the top 2 types here
forbes %>%
  filter(country == "Japan") %>%
  count(category, sort = TRUE) %>%
  slice_max(order_by = n, n = 5)

                category  n
1                Banking 69
2 Diversified financials 24
3      Consumer durables 22
4         Transportation 20
5          Capital goods 19

## side-by-side category counts: one row per category, one column per
## country, 0 where a country has no rows in a category, sorted by the
## combined count of both countries (largest first)
forbes %>%
  filter(country %in% c("United States", "Japan")) %>%
  count(country, category) %>%
  pivot_wider(
    names_from = country,
    values_from = n,
    values_fill = 0
  ) %>%
  arrange(desc(Japan + `United States`))

# A tibble: 27 × 3
   category                         Japan `United States`
   <fct>                            <int>           <int>
 1 Banking                             69              83
 2 Diversified financials              24              60
 3 Retailing                           12              53
 4 Utilities                           11              54
 5 Health care equipment & services     4              53
 6 Insurance                            9              46
 7 Business services & supplies        17              31
 8 Consumer durables                   22              25
 9 Technology hardware & equipment      6              33
10 Food drink & tobacco                10              28
# ℹ 17 more rows

##while the top 2 categories are the same in both countries, the usa has far more companies in other categories such as retailing and utilities, and it has more companies than japan in every category shown above, including the ones japan has many of

## linear regression of profits on assets, market value, and sales
profit_model <- lm(
  formula = profits ~ assets + marketvalue + sales,
  data = forbes
)
## coefficient estimates, p-values, and overall fit statistics
summary(profit_model)


Call:
lm(formula = profits ~ assets + marketvalue + sales, data = forbes)

Residuals:
     Min       1Q   Median       3Q      Max 
-29.2169  -0.0189   0.1160   0.2107   8.9495 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.1186259  0.0380039  -3.121  0.00183 ** 
assets      -0.0008395  0.0003781  -2.220  0.02651 *  
marketvalue  0.0363340  0.0018183  19.982  < 2e-16 ***
sales        0.0098892  0.0024331   4.064    5e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.472 on 1991 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.3059,    Adjusted R-squared:  0.3049 
F-statistic: 292.5 on 3 and 1991 DF,  p-value: < 2.2e-16

##all three predictors (assets, market value, sales) have p-values below 0.05, which suggests each is related to profits after accounting for the other two

## analysis-of-variance table for the regression; anova() on a single lm fit
## tests the terms sequentially, each after the terms listed before it
anova(profit_model)

Analysis of Variance Table

Response: profits
              Df Sum Sq Mean Sq F value    Pr(>F)    
assets         1  312.8  312.84  144.40 < 2.2e-16 ***
marketvalue    1 1552.8 1552.77  716.71 < 2.2e-16 ***
sales          1   35.8   35.79   16.52 5.001e-05 ***
Residuals   1991 4313.6    2.17                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

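##all variables have tiny p-values, making them statistically significant, meaning there is a relationship between them and profits; the p-values for assets and market value are both printed as < 2.2e-16, which is only the smallest value R displays, so they cannot be called equal, and a p-value does not measure the size of an effect; judging by the sums of squares, market value (1552.8) accounts for far more of the variation in profits than assets (312.8)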

## histogram of the regression residuals to inspect their distribution
hist(
  residuals(profit_model),
  main = "Residuals",
  xlab = "Residuals"
)

##histogram is left skewed (the residuals run from about -29 to 9), suggesting the residuals are not normally distributed and the model badly over-predicts profits for a few companies

## restrict the data to united states rows and refit the same regression
## (profits on assets, market value, and sales) for that country only
forbes_usa <- filter(forbes, country == "United States")
model_usa <- lm(profits ~ assets + marketvalue + sales, data = forbes_usa)
## all three terms have tiny p-values, so each is significantly related to
## profits (the response of this model), not to assets
anova(model_usa)

Analysis of Variance Table

Response: profits
             Df Sum Sq Mean Sq  F value    Pr(>F)    
assets        1 975.23  975.23 1479.783 < 2.2e-16 ***
marketvalue   1 895.96  895.96 1359.498 < 2.2e-16 ***
sales         1  30.41   30.41   46.142 2.253e-11 ***
Residuals   744 490.32    0.66                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## restrict the data to japan rows and refit the same regression
## (profits on assets, market value, and sales) for that country only
forbes_japan <- forbes %>%
  filter(country == "Japan")
model_japan <- lm(
  formula = profits ~ assets + marketvalue + sales,
  data = forbes_japan
)
## assets and market value have significant p-values, sales does not
anova(model_japan)

Analysis of Variance Table

Response: profits
             Df  Sum Sq Mean Sq F value Pr(>F)    
assets        1 284.846 284.846 433.311 <2e-16 ***
marketvalue   1 162.574 162.574 247.310 <2e-16 ***
sales         1   0.024   0.024   0.036 0.8497    
Residuals   312 205.100   0.657                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## main difference between usa and japan is that assets, market value, and sales all have a significant relationship with profits in the usa, while in japan only assets and market value do (sales adds nothing significant once assets and market value are in the model)