#############
R-code notes compilation for  Week 3 February 1st, 3rd
Compiled by: Deepta Basak. B.Math(hons.) 1st year
#############


Course = "B.Sc."
Number = 40
Smart = TRUE
#creating 4 variables
mode(Course)
#mode() gives the output that Course is a character datatype
mode(Number)
#Number has numeric datatype
mode(Smart)
#Smart has logical datatype


KA_D_31_1_22 = c(215,620,558, 1109,8813,350, 780, 420,
                 807,185,1993,515,1997,2886,371,156,
                 589,1746,838,964,296,128)
#vector containing district wise discharge data of COVID19 patients in Karnataka on 31/01/2022
#c is used to concatenate/combine individual values to get a vector
KA_D = c(215,620,558, 1109,8813,350, 780, 420, #district wise discharge data of COVID19 patients in Karnataka on 31/01/2022
         144,478,816,242,1051,249,1238, 315,
         807,185,1993,515,1997,2886,371,156,
         589,1746,838,964,296,128)
KA_D_31_1_22[c(1,3,5)] 
#selects elements at index 1,3,5 from the vector KA_D_31_1_22
KA_D_31_1_22[-c(1:20)] 
#removes the elements having index from 1 to 20 in the vector
KA_D_31_1_22p = (KA_D_31_1_22)/sum(KA_D_31_1_22)*100 
KA_D_31_1_22p
#vector of percentage of discharge across districts
#shows vectorization of computation
KA_D_31_1_22[KA_D_31_1_22 < 1000] 
#using logical operators to select elements of the vector: chooses all elements that return TRUE
sum(KA_D_31_1_22 > 2000) 
#converts logical operator to numeric and then adds
#returns the number of elements of the vector that satisfy the condition
max(KA_D_31_1_22) 
#returns the maximum of the vector
which(KA_D_31_1_22==max(KA_D_31_1_22)) 
#picks the position of the maximum

x = 3:7
x 
#vector is a collection of objects in the same mode/stored as one variable
#R stores everything as a vector
#x is a vector containing all positive integers from 3 to 7
s= seq(1,100, by=1) #by is the common difference
s
#seq() is used to create a vector containing all numbers from 1 to 100 at a gap of 1
s100 = seq(1,100,by=0.5)
s100
#creates a vector containing all numbers from 1 to 100 at a gap of 0.5
rep(6,7) 
#repeats the specified element in the first argument(6) as many times as the second argument(7 times)
rep(x,3) 
#the first argument can be a vector also

x = c(1,45,6,7,NA,99,0) 
x
#when we collect data, there may be missing values 
# missing values are represented by NA

#x==NA #cannot test equality to a missing value 
mean(x) 
#any computation with a missing value will return a missing value 
mean(x,na.rm=TRUE) 
#na.rm=TRUE removes all missing values before computing anything
is.na(x) 
#tests if a value is missing or not
mean(x[!is.na(x)]) 
#logical operators ! and is.na is used to calculate the mean of all the non-missing values

x = c("Siva", "looser", "3", "5")
mode(x)
#numbers have been changed to characters

as.numeric(x[3]) + as.numeric(x[4])
#x[3] +x[4] cannot perform computation
#as.numeric() converts the values to numeric before computing

A  = matrix(seq(3,5, by=1/10), 7,3) 
A
#vector of entries; number of rows(7) and number of columns(3)
#all entries of a matrix have to be of the same mode
#fills entries column wise from 3 to 5 at a gap of 1/10
B  = matrix(seq(3,5, by=1/10), ncol=3)
B
#creates a matrix having 3 columns with entries from 3 to 5 at a gap of 1/10
C  = matrix(seq(3,5, by=1/10), ncol=3, byrow=TRUE) 
C
#fills entries row wise
A[4,1] 
#selects the element at row=4 and column=1

xd = c("Siva", "looser", 3, 5) 
xd
#all elements of vector/matrix have to be of the same mode
#data frame is like a matrix/ rectangular array 
#each column in a data frame can be in a different mode

KA_District=c("Bagalakote","Ballari","Belagavi","Bengaluru Rural","Bengaluru Urban", 
              "Bidar","Chamarajanagara","Chikkaballapura","Chikkamagaluru","Chitradurga",
              "Dakshina Kannada"," Davanagere","Dharwada","Gadag","Hassana","Haveri",
              "Kalaburagi","Kodagu","Kolara","Koppala","Mandya","Mysuru","Raichuru",
              "Ramanagara","Shivamogga","Tumakuru","Udupi","Uttara Kannada","Vijayapura"   
              ,"Yadagiri")
# creates a vector with (corresponding) names of districts
KA_Discharge = data.frame(KA_District, KA_D) 
#creates a data frame having variables KA_District and KA_D
class(KA_Discharge) 
#specifies as data frame 
mode(KA_Discharge)
#specifies the mode as list
sapply(KA_Discharge,mode)
#applies the mode function to each variable/column in the data frame
names(KA_Discharge)=c("District", "Recovered") 
#renames the variables of the data frame 
KA_Discharge$Recovered
#displayes the Recovered column of the dataframe KA_Discharge
KA_Discharge[3,2]
#selects the 3rd entry of the 2nd column
KA_Discharge[3,]
#displays the entire 3rd row
KA_Discharge[,"Recovered"]
#displays the entire Recovered column

Deaths= c(346, 1712, 975, 903, 16593, 407,515, 446, 400,221, 1750,  611,
          1333,328,1291,652,856, 343, 647, 530, 673, 2494, 346, 
          338, 1105, 1172, 509, 793, 500, 206)
#vector containing the number of deaths
KA_Discharge$Deaths = Deaths 
#creates a variable Deaths in the dataframe KA_Discharge by filling the entries from the vector Deaths
head(KA_Discharge)
#displays the first 6 rows of KA_Discharge
kabulldf = read.csv(file="KAbulletin.csv", header = TRUE) 
#data extracted from Government of Karntaka  COVID-19 bulletins from 
#https://covid19.karnataka.gov.in/govt_bulletin/en
# Make sure to set the correct working directory using set_wd()
#read.csv -> used to read csv files
#file = "..." -> where csv file is located
#header = TRUE -> 1st row fo csv file provides the names of the variables
head(kabulldf) 
#reveals that the data frame has 8 variables
mode(kabulldf)
#mode is list

sapply(kabulldf, mode)
#displayes the mode of all the variables
kabulldf[which.max(kabulldf$"Today.s.Positives"),] 
#selects the district that has the max Today's positives

hpkabulldf = subset(kabulldf, kabulldf$"Today.s.Positives" > 1000) 
# QUERIES : finds objects that have certain properties
#subset command has two arguments: 1. the data frame, and 2. the condition for creating the subset
hpkabulldf = subset(kabulldf, Today.s.Positives > 1000) 
#this also works

IRDkabulldf = subset(kabulldf,
                    select = c("Total.Positives", "Total.Discharges", "Deaths"))
head(IRDkabulldf)
#IRDkadulldf is a subset of kabulldf having on the specified variables
#data frame created as a subset retains the original column names
okabulldf = kabulldf[order(kabulldf$Today.s.Positives),] 
head(okabulldf,4)
#reorders the rows of the dataframe corresponding to the order of the variable Today.s.Positives

#generating random data in R
#Goal: Generate samples from a given distribution
sample(1:6, 10, replace = T) 
#rolling of a fair die
#1:6 -> the vector to sample from {1,2,3,4,5,6}
#10 ->  how many samples to generate
#replace = T -> sample with replacement 
#probability is not specified, so it is a fair die
sample(c(0,1), 10, replace = TRUE, prob = c(0.3,0.7)) 
#tossing a biased coin 10 times
#chooses 0 with prob 0.3 and 1 with prob 0.7

#Binomial (n,p) distribution
rbinom(10,6,0.5)
#rbinom(m,size,prob)
#m -> number of samples required
#size -> number of bernoulli trials (n)
#prob -> probability of success (p)
rbinom(1000,10,0.5)
hist(rbinom(1000,10,0.5))

#Normal distribution
rnorm(1,10,5)

#Exponential distribution
rexp(1000,1)