library(maps)
library(tidyverse)  # also attaches dplyr, ggplot2, tibble, stringr
library(rvest)  # for html
library(geonames)
library(uniformly) # for sampling uniformly from the sphere 
library(GGally)  # for ggpairs
library(lubridate)  # for parsing time 
library(e1071) # skewness and kurtosis
library(sp)
library(rmarkdown)
library(ggmap)
library(viridis)
library(maptools)
library(mapdata)
library(ggthemes)
library(rtweet)
library(DT) 
knitr::opts_chunk$set(echo = FALSE)
options(scipen=999)

Q1. Simulate random points on earth (31 pt)

  1. (10pt) What is the average geodesic distance between two points sampled uniformly on the sphere? simulate \(1000\) random pairs of points and estimate it using the pairwise distances computed for the simulated points.

You may use the function runif_on_sphere from the package uniformly, to simulate points uniformly on the sphere.

Next, repeat the same simulation but this time estimate the average Euclidean distance between two random points on the sphere.
# Helper functions for converting between coordinates.

# You may use and modify the functions. It is your responsibility that 

# the input/output format used in them are correct. 



# Convert to Cartesian coordinates: the function assumes that theta is in [0,pi] and phi is in [0,2pi]

spherical_to_cartesian=function(point)

{
  x=point$r*cos(point$phi)*sin(point$theta)

  y=point$r*sin(point$phi)*sin(point$theta)

  z=point$r*cos(point$theta)
  
  return(list(x=x,y=y,z=z))
}



# Convert to Spherical coordinates: the function assumes that theta is in [0,pi] and phi is in [0,2pi]

cartesian_to_spherical=function(point) 

{

  r=sqrt(point$x**2+point$y**2+point$z**2)

  phi=atan(point$y/point$x) + pi * ( 1 - sign(point$y)*(1+sign(point$x))/2 )  # take care of all sign cases. When x=0 or y=0 exactly, the function may not work correctly, but these are events of measure zero for any continuous distribution over the sphere.

  theta=acos(point$z/r)

  return(list(r=r,phi=phi,theta=theta))

}



# Geodesic distance between points. 

# Input format: p1 and p2 are lists having items: r, phi, theta in spherical coordinates, 

# i.e. phi in [0,2pi] and theta in [0,pi]

geodesic_dist=function(p1, p2)

{  return ( p1$r * acos( cos(p1$theta)*cos(p2$theta) + sin(p1$theta)*sin(p2$theta) * cos(p1$phi-p2$phi) ) )

}
# I will modify cartesian_to_spherical so it can receive the output of runif_on_sphere (an n x 3 matrix of Cartesian coordinates)

cartesian_to_spherical2=function(point) 

{
  x = point[,1];y=point[,2];z=point[,3]
  
  r= sqrt(x**2+y**2+z**2)

  phi=atan(y/x) + pi * ( 1 - sign(y)*(1+sign(x))/2 )
  
  theta=acos(z/r)
  
  return(list(r=r,phi=phi,theta=theta))

}


geodesic.simulation.function = function(number_of_simulation){
  n=number_of_simulation
  answer= c()
  for (i in 1:n){
    #p1=0 ;    p2=0 ; distance=0; for_geo_p1=0;for_geo_p2=0
    
    p1=runif_on_sphere(1, d=3, r = 6371)
    p2=runif_on_sphere(1, d=3, r = 6371)
    
    for_geo_p1= cartesian_to_spherical2(p1)
    
    for_geo_p2= cartesian_to_spherical2(p2)
    
    distance=geodesic_dist(for_geo_p1,for_geo_p2)
    
    answer=append(answer,distance)
    
  }
  return(answer)
}
# apply the geodesic simulation

geodesic_sumulation_1a= geodesic.simulation.function(1000)
DT::datatable(round(data.frame(head(geodesic_sumulation_1a,5)),3),caption = " Head of 5 geodesic distances- OR",class = 'display')
cat("Then mean geodesic distance of 1000 simuations on earth is",mean(geodesic_sumulation_1a)," :]" )
## Then mean geodesic distance of 1000 simuations on earth is 9912.483  :]
# apply the Euclidean simulation

Euclidean.simulation = function(number_of_simulation){
  n=number_of_simulation
  answer= c()
  for (i in 1:n){
    #p1=0
    #p2=0
    #d=0
    p1=runif_on_sphere(1, d=3, r = 6371)
    p2=runif_on_sphere(1, d=3, r = 6371)
    
    CalculateEuclideanDistance=function(vect1, vect2) sqrt(sum((vect1 - vect2)^2))
    
    d= CalculateEuclideanDistance(p1,p2)
    
    answer=append(answer,d)
  }

  return(answer)
}


Euclideanc_sumulation_1a =Euclidean.simulation(1000)
DT::datatable(round(data.frame(head(Euclideanc_sumulation_1a,5)),3),caption = " Head of 5 euclidean distances",class = 'display')
cat("Then mean euclidean distance of 1000 simuations on earth is",mean(Euclideanc_sumulation_1a)," :)" )
## Then mean euclidean distance of 1000 simuations on earth is 8676.459  :)

In question 1a, I ran two simulations to estimate the mean distances, both geodesic and Euclidean.

A simulation is useful only if it closely mirrors real-world outcomes; here it estimates that two random points on earth are, on average, the distances apart reported above.

Recall that by the central limit theorem, when many independent random variables are averaged, their properly normalized mean tends toward a normal distribution even if the original variables themselves are not normally distributed.

Here the average distance is estimated from 1000 simulated pairs (far more than the usual rule-of-thumb of 30 samples), so the estimate of the mean should be quite stable.
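For reference, here is a minimal vectorized sketch of the same two estimates (it reuses the cartesian_to_spherical2 and geodesic_dist helpers defined above; the object names are my own):

n_sketch = 1000
A_pts = runif_on_sphere(n_sketch, d = 3, r = 6371)   # first point of each pair
B_pts = runif_on_sphere(n_sketch, d = 3, r = 6371)   # second point of each pair

# Euclidean distances: row-wise norm of the coordinate differences
euc_sketch = sqrt(rowSums((A_pts - B_pts)^2))

# Geodesic distances: convert to spherical coordinates and apply the helper
geo_sketch = geodesic_dist(cartesian_to_spherical2(A_pts), cartesian_to_spherical2(B_pts))

c(mean_euclidean = mean(euc_sketch), mean_geodesic = mean(geo_sketch))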

#bonus:

Derive an exact mathematical formula for the average geodesic distance, and for the average Euclidean distance as functions of \(r\). Compare the formulas you got to the estimated distances from the simulations.
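For the bonus, the exact expectations can be derived in closed form; a short sketch, using the fact that for two independent uniform points on a sphere the central angle \(\psi\) between them has density \(\frac{1}{2}\sin\psi\) on \([0,\pi]\):

$$
E[\text{geodesic}] = r\,E[\psi] = r \int_0^{\pi} \psi \cdot \tfrac{1}{2}\sin\psi \, d\psi = \frac{\pi r}{2},
$$

and, since the chord (Euclidean) length equals \(2r\sin(\psi/2)\),

$$
E[\text{Euclidean}] = \int_0^{\pi} 2r\sin(\psi/2) \cdot \tfrac{1}{2}\sin\psi \, d\psi = \frac{4r}{3}.
$$

For \(r = 6371\) km these give about \(10007.5\) km and \(8494.7\) km, in reasonable agreement with the simulated means above (about \(9912\) km and \(8676\) km).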

#Manual geodesic distance

# Spherical law-of-cosines distance; note that the inputs are treated as angles in degrees
haversine=function(long1, lat1,voxel1, long2, lat2,voxel2) {
  
  long1=long1*pi/180
  
  lat1=lat1*pi/180
  
  long2=long2*pi/180
  
  lat2=lat2*pi/180 
  
  voxel1= voxel1*pi/180
    
  voxel2= voxel2*pi/180

  R=6371 # Earth mean radius [km]
  
  delta.long=(long2 - long1)
  
  delta.lat=(lat2 - lat1)
  
  delta.voxel= (voxel2-voxel1)
  
  distance_geodo= (cos(lat1)*cos(lat2) +sin(lat1)*sin(lat2)*cos(voxel2-voxel1))
  
  distance_geodo2= R*(acos(distance_geodo))
  
  
  #a=sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2
  
  #c=2 * asin(min(1,sqrt(a)))
  
  #d = R * c
  
  return(distance_geodo2) # Distance in km
  #return(d)
}


# lines of longitude correspond to x-values, lines of latitude to y-values

manaul_dis_geo_sims= function(n){
  
      re= c()
      
       for (i in 1:n){
         
       math_distance_geodesic=0
       
     #  P1=0 ;P2=0;  X1=0 ;       X2=0 ;       Y1=0 ;       Y2=0 ;       Z1=0 ;       Z2=0
       
      P1= runif_on_sphere(1, d=3, r = 6371)
      
      X1= P1[,1]
      Y1=P1[,2]
      Z1= P1[,3]
      
      P2= runif_on_sphere(1, d=3, r = 6371)
      
      X2= P2[,1]
      Y2=P2[,2]
      Z2= P2[,3]

      math_distance_geodesic=haversine(X1,Y1,Z1, X2,Y2,Z2) # note: the Cartesian x, y, z values are passed here as if they were angles in degrees
      
      re=append(re,math_distance_geodesic)
       
     }
    return(re)
  
}



result_manual_geo= mean(manaul_dis_geo_sims(1000))

#length(manaul_dis_geo_sims(1000)) ==1000

#result_manual_geo
#Manual Euclidean distance

manaul_dis_euc_sims= function(n){
    re= c()
     for (i in 1:n){

     # math_euc= 0 ;  P1=0 ;P2=0 ;     X1=0 ;       X2=0 ;       Y1=0 ;       Y2=0 ;       Z1=0 ;       Z2=0
      
      P1= runif_on_sphere(1, d=3, r = 6371)
      
      X1= P1[,1]
      Y1=P1[,2]
      Z1= P1[,3]
      
      P2= runif_on_sphere(1, d=3, r = 6371)
      
      X2= P2[,1]
      Y2=P2[,2]
      Z2= P2[,3]

      math_euc= sqrt (sum ((X1-X2)^2+(Y1-Y2)^2+(Z1-Z2)^2) )
     
     # sample_for_eucXy1= sample_for_euc[,1] ;sample_for_geoxy2= sample_for_euc[,2]
       # math_Euclideanc_sumulation_1a=manual_EuclideanDistance (sample_for_eucXy1,sample_for_eucXy2 ) 
      re=append(re,math_euc)
            }
    return(re)
  
}

result_manual_euc= manaul_dis_euc_sims(1000)
# SUM UP THE 4 SIMULATIONS 

geo_mean_manuall= result_manual_geo

euc_mean=  mean(Euclideanc_sumulation_1a) 

euc_mean_manual= result_manual_euc

meangeodesic_sumulation_1a=mean(geodesic_sumulation_1a)
  
Q1a_bun_table = rbind(mean(geodesic_sumulation_1a),  mean(geo_mean_manuall), euc_mean,mean(euc_mean_manual))

rownames(Q1a_bun_table) = c("Geodesic simulation - mean", "Geodesic simulation (manual formula) - mean", "Euclidean simulation - mean", "Euclidean simulation (manual formula) - mean")

#Q1a_bun_table= t(Q1a_bun_table)


DT::datatable(round(data.frame(Q1a_bun_table),3),caption =' Formulas Comparison',class = 'display')

Following the above simulations, I repeated the estimation using "manual" implementations of the distance formulas.

The Euclidean calculation is essentially the same in both versions, so the two Euclidean means agree closely, as shown above.

The geodesic means differ more noticeably. Part of this is sampling noise from drawing two fresh random samples, but most of it is likely because the manual geodesic helper treats the Cartesian x, y, z values as longitude/latitude angles in degrees instead of first converting them to geographic coordinates.

This was an interesting part that raised many questions.

  1. (6pt) Repeat the above simulations, but this time keep all results and plot the empirical distributions for the two types of distance.

Compute the skewness and kurtosis of the two empirical distance distributions.

Which of them looks closer to the Normal distribution? explain.

#geodesic_sumulation_1a_=str(geodesic_sumulation_1a)

#geodesic_sumulation_1a_=na.omit(geodesic_sumulation_1a_)

geo_dist_q1b_OR= ecdf(geodesic_sumulation_1a)

x1b= seq(0, 1, by = 0.02); y1b=qnorm(x1b, mean = 2, sd = 1)

# such as TASK 8 q 2

par(mfrow=c(1,3))    
plot(geo_dist_q1b_OR, ylab="Probability", xlab='Dists', main = "The empirical distributions of geodesic distances ", col='skyblue3',pch = 19)

plot(y1b, x1b, main = "P-norm (normal CDF)", col = 4, pch = 19)
plot(x1b, y1b, main = "Q-norm (normal quantile function)", col = 2, pch = 19)

####

q1b_Compute= rbind(round(skewness(geodesic_sumulation_1a), 3), round(kurtosis(geodesic_sumulation_1a), 3)) 
q1b_Compute= t(q1b_Compute)
colnames(q1b_Compute)= c(" Geodesic distance skewness value", "Geodesic distance kurtosis value")


DT::datatable(data.frame(q1b_Compute),caption =' Q 1 B -  Geodesic ',class = 'display')

Here, the empirical distribution (ECDF) of the geodesic distances is plotted next to two reference plots of the normal distribution for comparison. Its shape is reasonably close to that of a normal CDF.

Recall that a skewness value greater than 1 or less than -1 indicates a highly skewed distribution, a value between 0.5 and 1 (or between -1 and -0.5) a moderately skewed one, and a value between -0.5 and 0.5 a fairly symmetric one. The geodesic distance distribution is approximately symmetric, so in this respect it looks closer to the normal distribution.

Also, recall that kurtosis measures the "tailedness" of a probability distribution. A standard normal distribution has kurtosis 3 (mesokurtic); a distribution with kurtosis above 3 (leptokurtic) has a sharper peak and heavier tails, while one below 3 (platykurtic) has a broader peak and thinner tails. Note that e1071's kurtosis() reports excess kurtosis by default (approximately 0 for a normal sample), so the negative value obtained here indicates that the geodesic distance distribution is flatter than a normal one, with thinner tails.
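A quick sanity check of these conventions (a sketch; the exact numbers will vary with the random draw):

set.seed(123)
skewness(rnorm(1e5))   # approximately 0 for a normal sample
kurtosis(rnorm(1e5))   # approximately 0: e1071 reports excess kurtosis by default
kurtosis(runif(1e5))   # approximately -1.2: a flat distribution with thin tails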

euc_dist_q1b= ecdf(Euclideanc_sumulation_1a)


par(mfrow=c(1,3))    
plot(euc_dist_q1b, ylab="Probability", xlab='Dists', main = "The empirical distributions of euclidean distances ", col='skyblue3')
plot(y1b, x1b, main = "P-norm (normal CDF)", col = 4, pch = 19)
plot(x1b, y1b, main = "Q-norm (normal quantile function)", col = 2, pch = 19)

q1b_Compute2= rbind(round(skewness(Euclideanc_sumulation_1a), 3), round(kurtosis(Euclideanc_sumulation_1a), 3))
q1b_Compute2= t(q1b_Compute2)
colnames(q1b_Compute2)= c(" Euclidean distance skewness value", "Euclidean distance kurtosis value")
DT::datatable(data.frame(q1b_Compute2),caption =' Q 1 B Euclidean ',class = 'display')

Following the above explanations, the empirical distribution of the Euclidean distances is also shown next to the two normal reference plots. It is visibly further from the normal shape than the geodesic one.

The skewness value indicates that the distribution is not symmetric (the chord-length density increases up to the diameter, so the distribution is left-skewed), although not extremely so.

The kurtosis value again indicates a flat distribution with thin tails, as shown.
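A more direct visual comparison would be normal Q-Q plots of the two distance samples (a minimal sketch reusing the simulation vectors above):

par(mfrow = c(1, 2))
qqnorm(geodesic_sumulation_1a, main = "Geodesic distances vs. Normal", pch = 19, col = "skyblue3")
qqline(geodesic_sumulation_1a)
qqnorm(Euclideanc_sumulation_1a, main = "Euclidean distances vs. Normal", pch = 19, col = "violetred3")
qqline(Euclideanc_sumulation_1a)

The closer the points track the reference line, the closer the sample is to a normal distribution.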

  1. (7pt) Write a function that receives as input \(n\), the number of desired samples, and a string representing a country’s name. The function should sample random points from the specific country.

Use rejection sampling, where you can check if a point belongs to a country using the map.where function from the maps package.

Run the function to sample \(1000\) pairs of points in the USA and estimate the average geodesic distance for two random points in USA.

Hint: You may need to convert your points to geographical coordinates.

For which countries do you think that the rejection sampling method would be problematic? How would you improve it? explain in words (no need for analysis/code for this part).

# using https://moodle2.cs.huji.ac.il/nu21/mod/resource/view.php?id=294748 lecture 
reject_sphere_updated = function(n, r)
{
  x=rep(0, n)
  y=rep(0, n)
  z=rep(0, n)
  ctr=1
  while (ctr <= n)
  {
    cur_x=runif(1, -r, r)
    cur_y=runif(1, -r, r)
    cur_z=runif(1, -r, r)
    
    #cur_xyz = runif_on_sphere(1, d=3, r = 6371)
    
  if (cur_x^2+cur_y^2+cur_z^2 <= r^2)
      
    #if(cur_x^2 + cur_y^2 <= r^2)
    {
      x[ctr]=cur_x # cur_xyz[1]
      y[ctr]=cur_y # cur_xyz[2]
      z[ctr] =cur_z #cur_xyz[3]
      ctr=ctr+1
    }
  }
    re= cbind(x,y,z)
  return(re)  
}

Q1c_1000_t= reject_sphere_updated(1000000,180)

#Q1c_1000_t= reject_sphere_updated(1000000,6371)
Q1c_1000_t2=na.omit(Q1c_1000_t)

checking_vec=map.where(database = "world", Q1c_1000_t2[,1], Q1c_1000_t2[,2])

checking_vec= as.data.frame(checking_vec)

colnames(checking_vec)= c("YES_or_NO")

checking_vecUSA= checking_vec %>% filter(!is.na(YES_or_NO), YES_or_NO %in% c("USA", "USA:Alaska"))


USA_indexes= which(checking_vec=="USA")
 
q1c_corrdinates_inUSA= Q1c_1000_t2[USA_indexes,] #[,1:3] 


q1c_corrdinates_inUSA= as.data.frame(q1c_corrdinates_inUSA)
library(geodist)

distances_inUS= function(data,n, country= "USA"){
  
  data= data[sample(nrow(data), n), ][,1:2] # columns 1:2 are used as (long, lat)
  
  vector_of_dists_inUSA=c()
  
  for ( i in 1:(n-1)){ 
  ## geodist converts rectangular objects containing lon-lat coordinates into geodesic distances in metres.

   d= geodist(data[i,],data[i+1,],measure = "cheap")

  vector_of_dists_inUSA= append(vector_of_dists_inUSA,d)
    }
  return(vector_of_dists_inUSA)
}

mean_of_dists_USA=distances_inUS(q1c_corrdinates_inUSA,1000)

mean_of_dists_USA=na.omit(mean_of_dists_USA)


cat(" The average geodesic distance for two random points in USA is",mean(mean_of_dists_USA)/1000," KM due to spatial coordinates using cheap:]" )
##  The average geodesic distance for two random points in USA is 1589.418  KM due to spatial coordinates using cheap:]

In this section I applied several steps:

used rejection sampling to draw 1M candidate points, as done in lecture 10;

checked whether each point belongs to the \(USA\) using the map.where function from the maps package;

used a helper function to estimate the average geodesic distance between two random points in the \(USA\).

To speed up processing, I used geodist's "cheap ruler" measure (after trying all available measure types), which is designed for ultra-fast geodesic calculations on a city scale. It implements only a fraction of the full geodesic methods, has certain limitations, and is meant for performance-sensitive applications, where it can be about 100 times faster than conventional methods. The cheap ruler approximation is accurate for distances out to around 100 km, beyond which it becomes very inaccurate: average relative errors of Vincenty distances remain roughly constant at around 0.2%, while relative errors of cheap-ruler distances out to 100 km are around 0.16%. For continental-scale distances like these it should therefore be read as a rough estimate.

In my opinion, the countries for which rejection sampling would be problematic are small countries such as Israel: almost every candidate point falls outside the country, so an enormous number of samples is needed before enough points are accepted.

In other words, I needed almost 1M samples to obtain a bit more than 1K coordinates inside the USA, which were required to estimate the average geodesic distance between two random points in that country.

One way to quantify this is to estimate the probability of hitting the country: the number of accepted coordinates inside it divided by the total number of candidate points, which approximates the country's share of the world's area (I compute essentially this quantity in Q2e).
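For completeness, here is a minimal sketch of the function the question asks for (the name sample_points_in_country, the batch size of 1000, and the combination of runif_on_sphere with map.where are my own choices, not part of the original code):

# Sample n points uniformly inside a given country by rejection sampling
sample_points_in_country = function(n, country, r = 6371) {
  accepted = matrix(numeric(0), ncol = 2, dimnames = list(NULL, c("long", "lat")))
  while (nrow(accepted) < n) {
    # draw a batch of uniform points on the sphere and convert to lon/lat in degrees
    p = runif_on_sphere(1000, d = 3, r = r)
    long = atan2(p[, 2], p[, 1]) * 180 / pi
    lat  = asin(p[, 3] / r) * 180 / pi
    where = map.where(database = "world", long, lat)
    keep = !is.na(where) & grepl(country, where, fixed = TRUE)  # "USA" also matches "USA:Alaska"
    accepted = rbind(accepted, cbind(long, lat)[keep, , drop = FALSE])
  }
  accepted[1:n, ]
}

# usa_points = sample_points_in_country(2000, "USA")

The accepted points can then be paired, converted to spherical coordinates, and fed to geodesic_dist to estimate the average within-country distance.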

  1. (8pt) A statistician proposes to you two different methods for sampling points on the surface of the sphere of radius \(r\):

(i.) Sample from a uniform distribution \(\phi \sim U[0,2\pi]\) and \(\theta \sim U[0,\pi]\) independently, to get a point in spherical coordinates \((r,\phi,\theta)\).

This point can be converted to Cartesian coordinates to get \((x,y,z)\).

(ii.) Sample from a Normal distribution \(x,y,z \sim N(0,1)\) independently, and then normalize the resulting vector, i.e. set:

$$
(x, y, z) \;\leftarrow\; \frac{r \cdot (x, y, z)}{\sqrt{x^2 + y^2 + z^2}}
$$

For each of the two methods, determine if the method really samples points uniformly on the sphere, or whether there are certain areas of the sphere that will be more/less dense than others. You may use mathematical arguments and/or simulation results with computed statistics/plots to support your conclusion.

Note that the uniform distribution on the sphere has the property that two regions of equal areas on the sphere always have the same probability (this can be defined mathematically in details, but the formal definitions are not needed for the questions).

Recall that we know that the function runif_on_sphere does sample from the uniform distribution over the sphere, and you may compare your two methods to the output of this function.

uni_corr= runif_on_sphere(1000, d=3, r = 6371)

##

#stan_corr3000 = rnorm(3000)

stan_corr_x= rnorm(1000) 
stan_corr_y= rnorm(1000) 
stan_corr_z= rnorm(1000) 

stan_corr_matrix= cbind(stan_corr_x,stan_corr_y,stan_corr_z)

r  = 6371

transfor_corr= r/sqrt(stan_corr_matrix[,1]^2+stan_corr_matrix[,2]^2+stan_corr_matrix[,3]^2)

transformation_corr= r/ (sqrt(stan_corr_x^2+stan_corr_y^2+stan_corr_z^2))


stan_corr_trans_mat= stan_corr_matrix*transfor_corr

colnames(stan_corr_trans_mat)= c("x", "y", "z")

stan_corr_trans_mat= as.data.frame(stan_corr_trans_mat)

# small modification: same conversion, returning a data frame 

cartesian_to_spherical_1d=function(point) {

     r=sqrt(point$x**2+point$y**2+point$z**2)

  phi=atan(point$y/point$x) + pi * ( 1 - sign(point$y)*(1+sign(point$x))/2 ) 
  theta=acos(point$z/r)
  re= data.frame(r,phi,theta)
  return( re )
  }


stan_corr_trans_mat_con= cartesian_to_spherical_1d(stan_corr_trans_mat)

stan_corr_trans_mat_con2=as.data.frame(stan_corr_trans_mat_con)


# Method (i): sample phi ~ U[0, 2*pi] and theta ~ U[0, pi] independently,
# then convert the spherical coordinates (r, phi, theta) to Cartesian
unif_phi = runif(1000, 0, 2*pi) 
unif_theta = runif(1000, 0, pi) 
unif_sph_cart = spherical_to_cartesian(list(r = r, phi = unif_phi, theta = unif_theta))

unif_corr_matrix= cbind(x = unif_sph_cart$x, y = unif_sph_cart$y, z = unif_sph_cart$z)
set.seed(1)

par(mfrow=c(2,2))    
plot(uni_corr, asp = 1, pch = 19, col= 'red', main= 'runif_on_sphere values (x vs y)')
plot(stan_corr_trans_mat[,2:3], asp = 1, pch = 19, col= 'darkblue', main= 'rnorm, normalized to the sphere (y vs z)')
plot(unif_corr_matrix, asp = 1, pch = 19, col= 'mediumspringgreen', main= 'uniform (phi, theta), converted to Cartesian (x vs y)')
plot(stan_corr_trans_mat_con2[,2:3], asp = 1, pch = 19, col= 'darkblue', main= 'rnorm, normalized - cartesian to spherical transformation (phi vs theta)')

I sampled 1000 points with each of the three approaches: method (i), independent uniform \(\phi\) and \(\theta\); method (ii), independent standard normal x, y, z normalized onto the sphere of radius r; and, as a reference, runif_on_sphere, which we know samples from the uniform distribution over the sphere.

All three results are 1000x3 matrices of points on the sphere of radius 6371, so they can be compared directly.

Method (i) produces spherical coordinates, which are converted to Cartesian coordinates; method (ii) produces Cartesian coordinates after normalization, which I also converted to spherical coordinates using the team's cartesian_to_spherical function; runif_on_sphere returns Cartesian coordinates directly.

All three samples are plotted above.

Method (ii), the normalized normal sample, does sample points uniformly on the sphere: the standard multivariate normal distribution is rotationally invariant, so after projecting onto the sphere no direction is preferred, and its scatter looks very similar to that of runif_on_sphere.

Note that without the normalization step (mentioned explicitly in the Hebrew version of the exam) the normal sample would not lie on the sphere at all; I applied it, as instructed, so that all methods produce coordinates on the sphere.

Method (i), however, does not sample uniformly: drawing \(\theta\) uniformly on \([0,\pi]\) ignores the fact that the area of a band on the sphere is proportional to \(\sin\theta\), so the points come out too dense near the poles and too sparse near the equator. Its plot shows that certain areas of the sphere are more dense than others.
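A simple quantitative check is also possible (a sketch; it relies on the fact that for a uniform distribution on a sphere of radius r the z-coordinate is itself uniform on \([-r, r]\)):

# Under uniformity on the sphere, z ~ U(-6371, 6371); compare the three samples
ks.test(uni_corr[, 3],          "punif", min = -6371, max = 6371)  # runif_on_sphere: should not reject
ks.test(stan_corr_trans_mat$z,  "punif", min = -6371, max = 6371)  # normalized rnorm: should not reject
ks.test(unif_corr_matrix[, 3],  "punif", min = -6371, max = 6371)  # method (i): expected to reject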

Q2. Analysis of geographical data (36 pt)

  1. (6pt) Read the list of world countries by area from Wikipedia dataset file here.

Use the rvest package to read the html file and extract the data table representing countries by their area into a data-frame.

Remove rows with country name containing parenthesis ().

Convert the column representing total area to numerical values in square km. For countries where more than one value is available, take the first one. Show the top and bottom two rows of the resulting data-frame.

Q2data = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area" %>% read_html %>% html_nodes("table")


q2_html_data_extract = html_table(Q2data[2], fill = TRUE)[[1]]

#cleaning

colnames(q2_html_data_extract)[3] = "total_field"

q2_html_data_extract$total_field = sub(" .*", "", q2_html_data_extract$total_field) 

q2_html_data_extract$total_field = gsub(",","", q2_html_data_extract$total_field)

q2_html_data_extract$total_field = sub("\\(.*", "", q2_html_data_extract$total_field)

q2_html_data_extract$total_field = sub("<", "", q2_html_data_extract$total_field)  

q2_html_data_extract$total_field = as.numeric(as.character(q2_html_data_extract$total_field)) 

q2_html_data_extract = na.omit(q2_html_data_extract)
DT::datatable(data.frame(head(q2_html_data_extract, 2)),caption =' First 2 rows - area  ',class = 'display')
DT::datatable(data.frame(tail(q2_html_data_extract, 2)),caption =' Last 2 rows - area ',class = 'display')

Here, I read the table from the given Wikipedia page using the rvest package.

I removed rows with country names containing parentheses (one way to do this is sketched below), and rows that fail the numeric conversion are dropped by na.omit.

I converted the column representing the total area to numeric values in square km; for countries where more than one value is available, the first one is taken.

Finally, as displayed above, the top and bottom two rows of the resulting data-frame are shown.
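A minimal sketch of the parenthesis filter (assuming the scraped country column is named 'Country / dependency', as used later in the merge step):

q2_html_data_extract = q2_html_data_extract %>%
  filter(!str_detect(`Country / dependency`, "\\("))   # drop any row whose country/dependency name contains "("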

  1. (6pt) Repeat the above analysis but this time with the list of world countries by population from here, where instead of the total area column,

convert to numerical values the column representing total population.

Q2data2 = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population" %>% read_html %>% html_nodes("table")

q2_html_data_extract1 = html_table(Q2data2[1])[[1]] # reading the data

#cleaning

q2_html_data_extract1$Population = sub(" .*", "", q2_html_data_extract1$Population) 

q2_html_data_extract1$Population = gsub(",","", q2_html_data_extract1$Population)

q2_html_data_extract1$Population = sub("\\(.*", "", q2_html_data_extract1$Population) 

q2_html_data_extract1$Population = as.numeric(as.character(q2_html_data_extract1$Population))  
q2_html_data_extract1 = na.omit(q2_html_data_extract1) #
DT::datatable(data.frame(head(q2_html_data_extract1, 2)),caption =' First 2 rows - population ',class = 'display')
DT::datatable(data.frame(tail(q2_html_data_extract1, 2)),caption =' Last 2 rows - population',class = 'display')

Here as well, I read the first table [1] from the given Wikipedia page using the rvest package.

Rows with country names containing parentheses are handled in the same way as above.

This time I converted the column representing the total population to numeric values; where more than one value is available, the first one is taken.

Finally, as displayed above, the top and bottom two rows of the resulting data-frame are shown.

  1. (6pt) Merge the two data-frames into a new one (denoted the wikipedia data-frame) and add a new column called pop.density, that lists for each country the number of people per square km.

Show a world-map (you can use the package rworldmap), where each country is colored by its population density. Display the three most dense and three least dense countries in the world in a table.

names(q2_html_data_extract)[names(q2_html_data_extract) == 'Country / dependency'] = "Country / Dependency"

Q2c_data = q2_html_data_extract %>% merge(q2_html_data_extract1, by = "Country / Dependency")

Q2c_data$pop.density = round(Q2c_data$Population / Q2c_data$total_field,3)
names.wiki = c("United States", "Kingdom of Denmark", "Republic of the Congo", "Sahrawi Arab Democratic Republic", "United Kingdom", "Somaliland", "Eswatini", "The Bahamas", "The Gambia", "Abkhazia", "United States Minor Outlying Islands", "State of Palestine", "Transnistria", "South Ossetia",  "Northern Cyprus",  "Artsakh", "East Timor", "Trinidad and Tobago")

names.map = c("USA", "Denmark", "Republic of Congo", "Western Sahara", "UK", "Somalia", "Swaziland", "Bahamas", "Gambia", "Georgia", "USA", "Palestine", "Moldova", "Georgia", "Cyprus",  "Armenia", "Timor-Leste", "Trinidad")
for (i in 1:length(names.wiki)){  Q2c_data["Country / Dependency"][Q2c_data["Country / Dependency"] == names.wiki[i]] = names.map[i]}
library("rworldmap")

Q2c_DATAmap = joinCountryData2Map(Q2c_data, joinCode = "NAME", nameJoinColumn = "Country / Dependency")
## 194 codes from your data successfully matched countries in the map
## 41 codes from your data failed to match with a country code in the map
## 54 codes from the map weren't represented in your data
# using lab SOL of mapCountryData example


Q2c_map = mapCountryData(mapToPlot =Q2c_DATAmap, nameColumnToPlot="pop.density",oceanCol='steelblue1', mapTitle = "pop.density plot- MAP ",missingCountryCol = "palegreen4", aspect = "variable")

summary(Q2c_data$pop.density)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     0.026    32.972    85.907   329.034   223.684 19381.188

Recall that population density is the concentration of individuals within a specific geographic area. Population density data can be used to quantify demographic information and to assess relationships with ecosystems, human health, and infrastructure. To calculate a density, you divide a count by the measurement of the area; the population density of a country is the number of people in that country divided by its area in square kilometers.

In the above map plot I can see that the density per square kilometer over most of the world is moderate, apart from a few extremely dense countries, as reflected in the summary of the pop.density vector above.

Q2cMatrixsort = Q2c_data %>% arrange(desc(pop.density)) %>% select(`Country / Dependency`,Population,total_field,pop.density)
DT::datatable(head(Q2cMatrixsort, 3),caption = " Three most dense (people per square km)",class = 'display')
DT::datatable(tail(Q2cMatrixsort, 3),caption = " Three least dense (people per square km)",class = 'display')

To summarise: I merged the two data-frames into a new one (the wikipedia data-frame) and added a column called pop.density that lists, for each country, the number of people per square km.

The world map above colors each country by its population density, and the tables display the three most dense and three least dense countries in the world.

  1. (8pt) Suppose that we choose two random individuals uniformly at random from the world’s population, and suppose that each individual is located uniformly at random in her country - what will be the average geodesic distance between the two?

Simulate \(1000\) pairs of people: first, sample their nationalities, and then sample their locations conditional on their nationality.

To speed-up the simulation, you may ignore countries with area smaller than \(1000\) square km. Use the lists of names to match between the names from wikipedia

to the names returned by the map.where function. Additional parsing might be needed for the output of this function in a few cases.

Use the simulation to estimate the average geodesic distance. How does it compare to the average geodesic distance between two random points on earth?

Sample_people_location_cartesian=function(Radius,data){ 
  
# long/lat arrive in degrees; convert to radians before applying sin/cos
long_rad = data$long * pi / 180
lat_rad = data$lat * pi / 180

x = as.data.frame(Radius * cos(long_rad) * cos(lat_rad))
colnames(x)=c("x") ;y = as.data.frame(Radius * sin(long_rad) * cos(lat_rad));
colnames(y)=c("y") ;z = as.data.frame(Radius *sin(lat_rad));
colnames(z)=c("z"); DF=cbind(x,y,z) ;

return(DF)
}


# using map_data's default "world" data
world_df_2d=map_data("world") 
  
region_unique=unique(world_df_2d$region)

Q2d_geod_dists=matrix(nrow = 1000,ncol = 2)

for (i in 1:1000){sample_cont=c(sample(region_unique,2,replace = TRUE)); Q2d_geod_dists[i,]=sample_cont}

Y_2d_people=c() ; X_2d_people=c()
for (someone in 1:1000){
  
  first_coun=select(filter(world_df_2d, region == Q2d_geod_dists[someone,1]),c("region","long","lat"))
  
  first_long=sample(first_coun$long,1) ;   first_lat=sample(first_coun$lat,1)
  
  sec_coun=select(filter(world_df_2d, region == Q2d_geod_dists[someone,2]),c("region","long","lat"))
  
  sec_long=sample(sec_coun$long,1) ;  sec_lat=sample(sec_coun$lat,1)
  
  Y_2d_people=append(Y_2d_people,first_lat) ;  Y_2d_people=append(Y_2d_people,sec_lat)
  
  X_2d_people=append(X_2d_people,first_long) ;  X_2d_people=append(X_2d_people,sec_long) }
XY_2d=cbind(as.data.frame(X_2d_people),as.data.frame(Y_2d_people))

colnames(XY_2d)= c("long", "lat")

peoplesDF_2d=Sample_people_location_cartesian(6371, XY_2d)

peoplesDF_2d_2 = as.data.frame(cartesian_to_spherical(peoplesDF_2d))
person_p1_y = 1 ; person_p1_x = 1
Q2d_geo_vector = c()   # initialize the distance vector once, outside the loop
while (person_p1_y < 1001){

  points1=peoplesDF_2d_2[person_p1_x,] ;   points2=peoplesDF_2d_2[person_p1_x + 1,]
  
  Q2d_geo_vector=append(Q2d_geo_vector,6371 * (acos(cos(points1$theta)*cos(points2$theta)+ sin(points1$theta)*sin(points2$theta)*cos(points1$phi - points2$phi))))
  
    person_p1_x = person_p1_x +2 ;  person_p1_y = person_p1_y + 1}
Q2d_geo_mean=mean(abs(Q2d_geo_vector), na.rm = TRUE)

q2d_table_results= round( data.frame(Q2d_geo_mean,mean(geodesic_sumulation_1a)), 2)

colnames(q2d_table_results)=c("2 people mean geodesic distance","2 points mean geodesic distance" )
DT::datatable(q2d_table_results,caption ="Q 2 d - compare points and sampled people all over the globe; geodesic distance" ,class = 'display')

Following the above simulations, I ran a new simulation that samples random locations of people.

First, I sampled a pair of countries for each of the 1000 pairs of people, using map_data.

Second, I converted the sampled longitude/latitude values to Cartesian coordinates with a helper function.

Third, I converted these to spherical coordinates, again using a helper function.

Fourth, I computed the geodesic distance between the spherical coordinates of each sampled pair of locations, collected the values in a distance vector, and calculated its mean.

Finally, as shown, I compared this to the mean geodesic distance between two random points on earth; the two estimates come out on a similar scale, although each run of 1000 pairs gives a somewhat different value.

This was an interesting section that raised many new questions.
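Note that the question asks to sample the two individuals uniformly from the world's population, i.e. nationalities should be drawn with probability proportional to population rather than uniformly over countries. A minimal sketch of that step (assuming the merged wikipedia data-frame Q2c_data and ignoring the name-matching details):

# keep countries that also appear as map regions (this also drops the "World" total row)
pop_df = Q2c_data[Q2c_data[["Country / Dependency"]] %in% region_unique, ]

# draw 1000 pairs of nationalities, with probability proportional to population
nat_pairs = replicate(1000,
                      sample(pop_df[["Country / Dependency"]], 2, replace = TRUE,
                             prob = pop_df[["Population"]]))
# nat_pairs is a 2 x 1000 matrix; locations can then be sampled within each country as above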

  1. (10pt) Suppose that we didn’t know the countries’ areas and want to estimate the area of a country by sampling.

Write a function that receives as input a vector of country names, a number of samples \(n\), and outputs estimates for the areas of all countries in square km, based on drawing \(n\) random points on earth and recording the relative frequency of points falling within each country.

Run the function with \(n=10000\) samples and with the list of countries from the wikipedia data-frame.

Report separately the top 10 countries with the largest estimated areas and

the top 10 countries with the largest true areas in tables. Do you see an agreement between the two top-10 lists?

Next, merge the estimated areas results with the data-frame and plot the true area (x-axis) vs. the estimated area (y-axis) for all countries. What is the \(R^2\) between the two?
countries_Q2e_both_DFs=c()

for (countries in 1:length(names.wiki)){countries_Q2e_both_DFs = names.wiki[countries];  Q2c_data$Country[Q2c_data$Country == countries_Q2e_both_DFs] =names.map[countries] }
countries_Q2e =subset(Q2c_data,Country == "World") ;World_field =countries_Q2e$total_field
Estimate_countries_fields = function(countries,n){
  
  countries2 =data.frame(a = c(1)) 
  
  colnames(countries2) =c("country")
  
  j = 0 
  
  i = 0
  
  while (j < n){ 
    
    X = sample(seq(from = -180,to = 180, by = 0.1),1)
  
      #X_Y=  runif_on_sphere(1, d=2, r = 180)
      
       Y = sample(seq(from = -90,to = 90, by = 0.1),1)
      
        j =j + 1
        
       if (is.na(map.where(x = c(X),y = c(Y))) != TRUE){
        
    #  if (is.na(map.where(x = X_Y[,1],y = X_Y[,2])) != TRUE){
    
      names_of =map.where(x = c(X),y = c(Y))
      
      i = i + 1 
      
      countries2[i,] = names_of}
  }
  
  data =countries2[is.element(countries2$country, countries),]
  
  data =as.data.frame(data)
  
  country_table =table(data$data)
  
  ratio= (country_table/n)*World_field
  
  return(ratio)
}

countries_field_estimationDF =Estimate_countries_fields(Q2c_data$Country,10000)
Estim_fields =as.data.frame(head(countries_field_estimationDF[order(countries_field_estimationDF,decreasing = TRUE)],10))

Real =Q2c_data[order(Q2c_data$total_field, decreasing = TRUE), ] ; Real=Real[!(Real$Country=="World"),]

Real2 =Real[,c(1,3)] ;colnames(Real2)[1]= "Country"
High_estimate =as.data.frame(Estim_fields$Var1) 
colnames(High_estimate) =c("Top 10 highest estimated fields")

#DT::datatable(High_estimate)
High_real =as.data.frame(head(Real$Country,10)) ;colnames(High_real) =c("Top 10 highest real fields")


DT::datatable(cbind(High_estimate, High_real ))
For_merger_estimat=countries_field_estimationDF[order(countries_field_estimationDF,decreasing = TRUE)]
For_merger_estimat=as.data.frame(For_merger_estimat)
colnames(For_merger_estimat)[1]= "Country"
Megred_results=merge(For_merger_estimat,Real2,by = "Country")

colnames(Megred_results) =c("Country","Estimated_S_KM","Real_WIki_S_KM")
ggplot(Megred_results, aes(Real_WIki_S_KM, Estimated_S_KM)) +geom_point(aes(col=Real_WIki_S_KM, size=Estimated_S_KM),size=0.5) + 
  geom_smooth(aes(col=Estimated_S_KM),method="lm", se=F)  +   labs(  y="PREDICT",x="REAL",title="REAL(X lab) vs PREDICTION (Y lab)",  caption="Areas")+geom_jitter(aes(col=Real_WIki_S_KM, size=Estimated_S_KM))
cat("The  R^2 between the Real and Estimated vectors  is",round(summary(lm(Megred_results$Estimated_S_KM~Megred_results$Real_WIki_S_KM))$r.squared,3) )
## The  R^2 between the Real and Estimated vectors  is 0.901
In this question, I wrote a function that samples coordinate pairs (X, Y) as longitude and latitude values.

Each sampled coordinate is then passed to map.where to check whether it falls inside a country from the given Wikipedia list.

For each such country, I count how many of the 10,000 sampled points fall inside it and turn this relative frequency into an estimated area.

The calculation is: the country's count divided by the given n (10,000), a fraction between 0 and 1, multiplied by the world's total area (510,072,000 square km), which yields the estimated area in square km of that country as a function of the sampling.
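In formula form (with \(n_c\) the number of sampled points that land in country \(c\), out of \(n\) samples in total):

$$
\hat{A}_c = \frac{n_c}{n} \cdot A_{\text{world}}, \qquad A_{\text{world}} \approx 510{,}072{,}000 \text{ km}^2.
$$

One caveat, not corrected in the code above: sampling longitude and latitude independently and uniformly is not exactly uniform over the earth's surface, since it over-represents high latitudes; weighting the samples by \(\cos(\text{latitude})\), or drawing points with runif_on_sphere and converting them to geographic coordinates, would remove this bias.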

During development I noticed that with a small number of simulations (e.g. 100) too few sampled points land in countries from the web list, so the estimates are not useful.

I also tried using runif_on_sphere, but it would have required more than 1M iterations to produce the required output.

Hence, I used the approach in https://stackoverflow.com/questions/68000621/distance-between-coordinates-in-dataframe-sequentially to get a workable sampling scheme for the 10K points.

As seen above, there is a good match (more than 80% overlap) between the two top-10 lists of real and estimated areas. This makes sense given the while-loop condition, which only keeps points whose map.where location is defined, and the is.element() check, which restricts the counts to countries appearing in the given list.

As for the plot, the regression treats the estimated area as a function of the real area. The implicit assumption that new data would behave the same way is of course only hypothetical: a country's true area does not actually change (barring wars or border changes that increase or decrease its total area).

As asked, recall that R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination.

The definition of R-squared is fairly straightforward: it is the percentage of the response variable's variation that is explained by the linear model,

\(R^2 = \text{explained variation} / \text{total variation}\). R-squared is always between 0 and 100%: 0% indicates that the model explains none of the variability of the response data around its mean, and 100% indicates that it explains all of it.

In general, the higher the R-squared, the better the model fits the data.

The more variance accounted for by the regression model, the closer the data points fall to the fitted regression line; if a model could explain 100% of the variance, the fitted values would equal the observed values and all points would lie on the line. Here the high value (more than 90%) is partial evidence that the model fits well, albeit with only one explanatory variable.

However, a high R-squared does not by itself guarantee a good fit; it is still worth inspecting how the points scatter around the fitted line.


Q3. Analysis and visualization of earthquake geographical data (33 pt)


  1. (6pt) Read the earthquakes dataset from USGS. Select all earthquakes

in the world in 2022 of magnitude above 2.5, and save them to a .csv file (The magnitude is in Richter’s scale).

Load the dataset into an R data-frame, and display the five latest earthquakes and the five strongest earthquakes.
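The file query.csv used below was downloaded manually from the USGS site. An alternative would be to query the USGS catalog directly from R; this is only a sketch, and the endpoint and parameter names (which follow the USGS FDSN event web service) should be double-checked against the USGS documentation:

# Hypothetical direct download from the USGS earthquake catalog (FDSN event service)
usgs_url = paste0("https://earthquake.usgs.gov/fdsnws/event/1/query?",
                  "format=csv&starttime=2022-01-01&endtime=2022-12-31&minmagnitude=2.5")
# download.file(usgs_url, "query.csv")
Q3_raw = read.csv(usgs_url)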
Q3_data = read.csv("query.csv")


Q3_data$time = as.POSIXct(Q3_data$time, format="%Y-%m-%dT%H:%M:%OSZ", tz="GMT")


Q3_data$updated = as.POSIXct(Q3_data$updated, format="%Y-%m-%dT%H:%M:%OSZ", tz="GMT")
DT::datatable(head(data.frame(Q3_data %>% arrange(desc(time))), 5),caption =' 5 recent earthquakes',class = 'display')
DT::datatable(head(data.frame(Q3_data %>% arrange(desc(mag))), 5),caption =' 5  strongest earthquakes',class = 'display')

Following the above tables, I want to add that there was a magnitude 3.1 earthquake close to Israel during the exam!

please check :

https://www.israelhayom.co.il/news/local/article/11941714

  1. (6pt) Plot the earthquakes on top of on the world map. That is, each earthquake should be represented by a point, with the x axis representing longitude, the y axis representing latitude, and the size of the point representing the magnitude. This scatter plot should be on top of the world map, drawn using the geom_map function from the ggmap package. The data-frame representing the map can be generated as the output of the command map_data("world").
world = map_data("world")
ggplot() +  geom_map( data = world, map = world,  aes(long, lat, map_id = region),  color = "royalblue1", fill = "yellow1", size = 0.1  ) +  geom_point( data = Q3_data, aes(longitude, latitude, size=mag), color = "red4",  alpha = 0.6 ) +  theme_void() +  theme(legend.position = "bottom")+  labs(title="Magnitude of earthquakes")

In the plot I can see the world map and the magnitudes of the earthquakes as a function of their locations; the size of each point indicates the magnitude of that earthquake.

  1. (6pt) Repeat the plot from (b.), but this time make separate plots for different earthquakes, where they are grouped together by magnitude. That is, there should be a separate plot for each of the following magnitude ranges: between \(2\) and \(3\), between \(3\) and \(4\), …, , between \(7\) and \(8\).
What can you conclude about the locations of strong/weak earthquakes?
set.seed(1)
  
#2&3
Q3_data_2_3 = Q3_data %>% filter(mag >= 2 & mag < 3)
plot_2_3 = ggplot() +  geom_map( data = world, map = world,   aes(long, lat, map_id = region),
    color = "royalblue1", fill = "wheat1", size = 0.1) +   geom_point(
    data = Q3_data_2_3, aes(longitude, latitude, size=mag), color = "navyblue",
    alpha = 0.5) +  theme_void() +  theme(legend.position = "right")+
  labs(title="Magnitude of earthquakes, magnitude range 2-3") 

#3&4
Q3_data_3_4 = Q3_data %>% filter(mag >= 3 & mag < 4)
plot_3_4 = ggplot() +  geom_map( data = world, map = world,   aes(long, lat, map_id = region),
    color = "royalblue1", fill = "wheat1", size = 0.1) +   geom_point(
    data = Q3_data_3_4, aes(longitude, latitude, size=mag), color = "navyblue",
    alpha = 0.5) +  theme_void() +  theme(legend.position = "right")+
  labs(title="Magnitude of earthquakes, magnitude range 3-4") 

#4&5
Q3_data_4_5 = Q3_data %>% filter(mag >= 4 & mag < 5)
plot_4_5 = ggplot() +  geom_map( data = world, map = world,   aes(long, lat, map_id = region),
    color = "royalblue1", fill = "wheat1", size = 0.1) +   geom_point(
    data = Q3_data_4_5, aes(longitude, latitude, size=mag), color = "navyblue",
    alpha = 0.5) +  theme_void() +  theme(legend.position = "right")+
  labs(title="Magnitude of earthquakes, magnitude range 4-5") 

#5&6
Q3_data_5_6 = Q3_data %>% filter(mag >= 5 & mag < 6)
plot_5_6 = ggplot() +  geom_map( data = world, map = world,   aes(long, lat, map_id = region),
    color = "royalblue1", fill = "wheat1", size = 0.1) +   geom_point(
    data = Q3_data_5_6, aes(longitude, latitude, size=mag), color = "navyblue",
    alpha = 0.5) +  theme_void() +  theme(legend.position = "right")+
  labs(title="Magnitude of earthquakes, magnitude range 5-6") 

#6&7
Q3_data_6_7 = Q3_data %>% filter(mag >= 6 & mag < 7)
plot_6_7 = ggplot() +  geom_map( data = world, map = world,   aes(long, lat, map_id = region),
    color = "royalblue1", fill = "wheat1", size = 0.1) +   geom_point(
    data = Q3_data_6_7, aes(longitude, latitude, size=mag), color = "navyblue",
    alpha = 0.5) +  theme_void() +  theme(legend.position = "right")+
  labs(title="Magnitude of earthquakes, magnitude range 6-7")

#7&8
Q3_data_7_8 = Q3_data %>% filter(mag >= 7 & mag < 8)
plot_8_7 = ggplot() +  geom_map( data = world, map = world,   aes(long, lat, map_id = region),
    color = "royalblue1", fill = "wheat1", size = 0.1) +   geom_point(
    data = Q3_data_7_8, aes(longitude, latitude, size=mag), color = "navyblue",
    alpha = 0.5) +  theme_void() +  theme(legend.position = "right")+
  labs(title="Magnitude of earthquakes, magnitude range 7-8")


#par(mfrow=c(3,2))
#plot_2_3, plot_3_4 ,plot_4_5,plot_5_6, plot_6_7,plot_8_7
library("ggpubr")
ggarrange(plot_2_3, plot_3_4 ,plot_4_5,plot_5_6, plot_6_7,plot_8_7 ,ncol=2,nrow =3)

summary(Q3_data$mag)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.500   3.000   4.200   3.927   4.500   7.300
plot(Q3_data$time, Q3_data$mag)
As per the 7 plots, I can conclude the following about the locations of strong/weak earthquakes:
Level 2-3
Earthquakes with magnitude 2-3 are mostly common in South America and the western USA.
Level 3-4
Earthquakes with magnitude 3-4 are also common in South America and the western USA, as well as in the oceans, south-eastern Asia, Africa, southern Europe and south-eastern Australia.
Level 4-6
Earthquakes with magnitudes 4-5 and 5-6 are common all over the globe, with the higher magnitudes concentrated in eastern Asia.

Level 6-8

The strongest earthquakes, of magnitude 6 to 8, appear in South America and southern and eastern Asia, as well as in the oceans.
The final plot raises a further question - does the time of year affect the magnitude of an earthquake?
It wasn't asked, but it can be used as another tool for learning about earthquake magnitudes.

It displays the earthquake magnitudes as a function of time, alongside the summary statistics of the magnitude shown above :]
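As a more compact alternative to the six almost-identical blocks above, the grouping by magnitude could also be done with cut() and facet_wrap() (a sketch, reusing the world and Q3_data objects defined earlier):

Q3_faceted = Q3_data %>%
  mutate(mag_band = cut(mag, breaks = 2:8, right = FALSE,
                        labels = paste(2:7, 3:8, sep = "-")))

ggplot() +
  geom_map(data = world, map = world, aes(long, lat, map_id = region),
           color = "royalblue1", fill = "wheat1", size = 0.1) +
  geom_point(data = Q3_faceted, aes(longitude, latitude, size = mag),
             color = "navyblue", alpha = 0.5) +
  facet_wrap(~ mag_band) +
  theme_void() +
  labs(title = "Earthquakes by magnitude range")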

  1. (6pt) Parse the column representing the time to extract (i) the number of days since the beginning of the year, and (ii) the time of day, in units of hours since midnight, as two new separate numeric columns.

That is, for example the date 2022-03-10T17:40:28.123Z will be converted to \(31+28+10=69\) days since the beginning of the year (the numbers of days in the first months are January:31, February:28, March:31, April:30, May:31), and to \(17+40/60+28/3600=17.6744\) hours since midnight (we ignore fractions of seconds here).

(Also e.) Plot pairwise correlations of the magnitude (mag), depth, day-of-year and time-of-day. (The depth column represents the depth below the ground in km of the center of the earthquake).

For which pairs of variables there is a significant correlation?

You may use the ggpairs command from the GGally package.
day_one_instructed = as.POSIXct("2022-01-01T00:00:00Z", format="%Y-%m-%dT%H:%M:%OSZ", tz="GMT")

date_accurate =difftime(Q3_data$time,day_one_instructed, units= c("days"))

Q3_data$exact_date= as.numeric(date_accurate)


####
q3_sec= abs(second(Q3_data$time))/3600
q3_min= minute(Q3_data$time)/60
q3_hour= hour(Q3_data$time)

Q3_data$hour =q3_sec + q3_min +q3_hour
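# Alternative parsing with lubridate (a sketch, kept commented out so it does not change the pipeline above):
# yday() gives the day-of-year directly, matching the question's example (2022-03-10 -> 69),
# and hour()/minute()/second() give the time of day in hours since midnight.
# Q3_data$exact_date = yday(Q3_data$time)
# Q3_data$hour = hour(Q3_data$time) + minute(Q3_data$time)/60 + second(Q3_data$time)/3600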
#function for customizing the lower triangle 

q3d_func = function(data, mapping, ...) {
  ggplot(data = data, mapping = mapping)+  geom_point(alpha = .5) +geom_smooth(method = "loess", formula = y ~ x, 
                fill = "steelblue4", color = "violetred3", size = 0.8)
} 


correlation_value= "royalblue2"
ggpairs(Q3_data[, c("mag", "depth", "exact_date","hour")], lower = list(continuous = q3d_func),mapping = ggplot2::aes(color = correlation_value))

Recall that correlation shows the strength of a relationship between two variables and is expressed numerically by the correlation coefficient, whose values range between -1.0 and 1.0.
The strength of the relationship varies with the value of the correlation coefficient. For example, a value of 0.2 indicates a positive but weak and likely unimportant correlation. Analysts in some fields do not consider correlations important until the value surpasses at least 0.8, and a correlation coefficient with an absolute value of 0.9 or greater represents a very strong relationship.

The most significant correlation is between the 'depth' and 'mag' pair.

The scatter plots in the lower triangle show that the fitted 'loess' line (local regression, used for smoothing) does not reveal any strong relationship between the other pairs of variables.

In the Hebrew version on GitHub this part is section f; in the English Moodle version it is section e.

(https://github.com/orzuk/njciRgcrh)

  1. ( or e.) (9pt) We want to test if there are times in the day in which earthquakes are more or less commons. Let \(t_i\) be the time of each earthquake \(i\) in the day (between \(00:00:00\) and \(23:59:59\)), in units of hours, calculated in (d.).

Test the null hypothesis \(H_0: t_i \sim U[0, 24]\) against the complex alternative

\(H_1: t_i \nsim U[0, 24]\). We test using the Pearson chi-square statistic:

$$
S = \sum_{i=1}^{24} \frac{(o_i - e_i)^2}{e_i},
$$

where \(o_i\) is the observed number of earthquakes in hour \(i\) and \(e_i = n/24\) is the expected count under \(H_0\).

Q3_data$hour = floor(Q3_data$hour)

O_i_test = Q3_data %>% group_by(hour) %>%summarise(count=n())

ei = sum(O_i_test$count)/24

q3d_s_statistic = 0 

for (i in 1:24){
  q3d_s_statistic = q3d_s_statistic + ((O_i_test$count[i] - ei)^2)/ei
}

pval = round(pchisq(q=q3d_s_statistic, df=23, lower.tail=FALSE), digits=3)  # 24 hourly bins -> 23 degrees of freedom



# chi-square statistic

S_stat = round(q3d_s_statistic, digits=3)


q3e_result= data.frame(pval, S_stat)
DT::datatable(head(data.frame(q3e_result)),caption =' Q 3 E results',class='display')

Both S statistic (chi-square statistic) value and the P-value are as displayed above.

Recall that in null-hypothesis significance testing, the p-value is the probability of obtaining test results at least as extreme as the result actually observed, under the assumption that the null hypothesis is correct. A small p-value means that such an extreme observed outcome would be very unlikely under the null hypothesis.

The p-value is a statistical measurement used to evaluate a hypothesis against observed data: it measures the probability of obtaining the observed results assuming that the null hypothesis is true. The lower the p-value, the greater the statistical significance of the observed difference.

Thus, with an alpha of 0.1, we reject the null hypothesis, since our p-value is smaller than alpha.
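As a cross-check, the same test could be run with the built-in chisq.test, which by default tests against equal cell probabilities and uses \(k - 1 = 23\) degrees of freedom (a sketch; it assumes the hour column already contains the integer hours 0-23 computed above):

hour_counts = table(factor(Q3_data$hour, levels = 0:23))  # include empty hours, if any
chisq.test(hour_counts)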