#############
# "R-code notes compilation for Week 9
#  March 22nd and 24th
#  Compiled by: Bikram Halder, B.Math(hons.) 1st year"
#############

## Analysis of Bivariate Data
# - Covariance
# - Correlation
# - Method of least squares (linear relation between 2 variables)

## Many datasets consist of 2 variables
# One of them - dependent variable, aka response variable
# The other   - independent variable, aka predictor or explanatory variable

### Let's analyze the dataset of Maternal smoking vs Infant Health
# Dataset   - https://www.stat.berkeley.edu/~statlabs/labs.html#babiesI
# Contains  - 1236 infant records
# Variables - bwt (birth weight in ounces) & smoke (smoking status of mother)
# (From Stat Labs website)

df <- read.table(
  file = "https://www.stat.berkeley.edu/~statlabs/data/babiesI.data",
  header = TRUE
)
# Reading the dataset

head(df)

unique(df$smoke)
### Extracting unique elements from the smoke variable
## Here,
# 0 - Didn't smoke
# 1 - Smoker
# 9 - Don't know

df$number <- seq_len(nrow(df))
# Assigning an extra variable (row index) for the convenience of plotting.
# The index is derived from the data rather than hard-coded as 1:1236, so the
# assignment cannot fail if the number of rows read ever differs.

## loading... tidyverse for ggplot
library(tidyverse)
library(viridis)

ggplot(data = df) +
  geom_point(
    aes(
      x = number,
      y = bwt,
      colour = as.factor(smoke)
    )
  ) +
  scale_colour_viridis_d(option = "plasma")
# Scatter plot of bwt
# where, legend - smoke as factor

ggplot(
  data = df,
  aes(x = bwt, fill = as.factor(smoke))
) +
  geom_histogram(colour = "white") +
  scale_fill_viridis_d()
# Histogram of bwt
# filled - smoke as factor

## Exercise for both of the plots above.
# - Change axis labels
# - Add title
# - Change legend

ggplot(
  data = df,
  aes(x = bwt, fill = as.factor(smoke))
) +
  geom_histogram(colour = "white") +
  scale_fill_viridis_d() +
  facet_wrap(
    ~ as.factor(smoke),
    nrow = 1
  )
# Splitting the histogram with facet_wrap() w.r.t.
# smoke variable

## Exercise:
# - Plot histogram proportions
# - Change axis labels
# - Add title
# - Change legend

# Box-plot of bwt, one box per smoke level.
# Question to ask of the picture: is the typical bwt of the "didn't smoke"
# group larger than that of the "smoker" group?
# (the line inside each box marks the group median)
ggplot(
  df,
  aes(x = as.factor(smoke), y = bwt, fill = as.factor(smoke))
) +
  geom_boxplot() +
  scale_fill_viridis_d()

# Same box-plot with the individual observations overlaid as jittered points
ggplot(
  df,
  aes(x = as.factor(smoke), y = bwt, fill = as.factor(smoke))
) +
  geom_boxplot() +
  geom_jitter(colour = "black", size = 0.2, alpha = 0.9) +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  labs(title = "Boxplot with Data as jitter", x = "") +
  theme(
    plot.title = element_text(size = 11),
    legend.position = "none"
  )

## To analyze this data further
# - Frame a hypothesis
# - Execute & test on data

## Further Ref:
# Book - Stat Labs: Mathematical Statistics Through Applications
#        by Deborah Nolan and Terry P. Speed
# Stat labs website - https://www.stat.berkeley.edu/users/statlabs/

## loading UsingR for fat dataset
library(UsingR)

?fat        # Info about fat dataset
names(fat)  # Variables

# Let's analyze the relation between neck and wrist

# Comparing averages in 2 ways:
# z - ratio of the two means, y - mean of the per-row ratios
z <- with(fat, mean(neck) / mean(wrist))
z

y <- with(fat, mean(neck / wrist))
y

# neck vs wrist in ggplot;
# method = "lm" adds a linear regression layer
plt1 <- ggplot(fat, aes(x = wrist, y = neck)) +
  geom_point() +
  geom_smooth(method = "lm")
plt1
## Relationship seems linear

# Same plot restricted to ages in [20, 30)
plt2 <- fat %>%
  filter(age >= 20, age < 30) %>%
  ggplot(aes(x = wrist, y = neck)) +
  geom_point() +
  geom_smooth(method = "lm")

# loading library cowplot
# for the function plot_grid() - used to present multiple plots in a frame
library(cowplot)

plot_grid(plt1, plt2, ncol = 2)
# The variables seem related and also by a linear relationship.
# Let's understand Covariance and Correlation with data

# Colour used for the points in the quadrant plots below.
# It is passed to geom_point() as a constant. Writing `col = rgb(...)` inside
# aes() would map the hex string as a one-level discrete variable, so ggplot2
# would draw its default palette colour (plus a legend) instead of this one.
point_colour <- rgb(.35, 0, 0)

fat %>%
  ggplot(aes(x = wrist, y = neck)) +
  geom_point(colour = point_colour) +
  geom_vline(xintercept = mean(fat$wrist)) +
  geom_hline(yintercept = mean(fat$neck))
# The two reference lines cross at (mean(wrist), mean(neck)) and split the
# plane into four quadrants.
# Observe: most of the datapoints are in the 1st and 3rd quadrants

# Let's slice and plot it again
# (rows 100 to 175; the reference lines use the means of the same rows)
fat_slice <- fat %>%
  slice(100:175)

fat_slice %>%
  ggplot(aes(x = wrist, y = neck)) +
  geom_point(colour = point_colour) +
  geom_vline(xintercept = mean(fat_slice$wrist)) +
  geom_hline(yintercept = mean(fat_slice$neck))
# Here also most of the datapoints lie in the 1st and 3rd quadrants

# Plotting relationship between age and ankle with sliced data
fat_slice %>%
  ggplot(aes(x = ankle, y = age)) +
  geom_point(colour = point_colour) +
  geom_vline(xintercept = mean(fat_slice$ankle)) +
  geom_hline(yintercept = mean(fat_slice$age))
# Nope! most of the datapoints are not in the 1st and 3rd quadrants
# Thus, no clear linear relationship

### Covariance
# - average of the products (x - mean(x)) * (y - mean(y)).
#   Points in the 1st and 3rd quadrants (relative to the means) contribute
#   positive products, points in the 2nd and 4th contribute negative ones,
#   so covariance measures the imbalance between these regions.

### Correlation
# - Covariance on a standardised scale (always between -1 and 1)

cor(fat$wrist, fat$neck)
# Correlation between wrist and neck variable in fat dataset

cor(fat$wrist, fat$height)
# Correlation between wrist and height variable in fat dataset

cor(fat$age, fat$ankle)
# Correlation between age and ankle variable in fat dataset

# loading MASS to load the dataset Animals
library(MASS)

# Is it true that animals with larger bodies have larger brains?
# Method 1: plot brain vs body variable
# (body, the explanatory variable, goes on the x-axis and brain, the response,
# on the y-axis - the same "y vs x" layout as the neck vs wrist plots)
Animals %>%
  ggplot(aes(x = body, y = brain)) +
  geom_point()

# Method 2: Computing Correlation between brain and body
cor(Animals$body, Animals$brain)

# Method 3:
# - Assign rank and transform the dataset
# - Compute correlation
cor(rank(Animals$body), rank(Animals$brain))

cor(Animals$body, Animals$brain, method = "spearman")
# AKA Spearman correlation coefficient
# i.e., a measure of how strongly the two variables are monotonically related

# Simple linear regression model of neck vs wrist (fat dataset)
# Point and line colours are set as constants on each geom. Placing
# `col = rgb(...)` inside aes() would treat the hex strings as levels of a
# discrete variable and draw ggplot2's default palette colours instead.
fat %>%
  ggplot(aes(x = wrist, y = neck)) +
  geom_point(colour = rgb(0, 0, 0.6)) +
  geom_smooth(
    method = "lm",
    colour = rgb(0.4, 0, 0),
    se = FALSE
  ) +
  geom_vline(xintercept = mean(fat$wrist)) +
  geom_hline(yintercept = mean(fat$neck))

###