set.seed(51) X = runif(200, min=0, max=4*pi) Y = sin(X) + rnorm(200, sd=0.3) Kreg = ksmooth(x=X,y=Y,kernel = "normal",bandwidth = 0.9) plot(X,Y,pch=20, main=paste("h =",.9)) lines(Kreg, lwd=4, col="purple") # Leave one out cross validation n = length(X) # n: sample size CV_err = rep(NA, n) for(i in 1:n){ X_val = X[i] Y_val = Y[i] # validation set X_tr = X[-i] Y_tr = Y[-i] # training set Y_val_predict = ksmooth(x=X_tr,y=Y_tr,kernel = "normal",bandwidth=0.9, x.points = X_val) CV_err[i] = (Y_val - Y_val_predict$y)^2 # we measure the error in terms of difference sqaure } mean(CV_err) # Choice of bandwidth using Leave one out cross validation h_seq = seq(from=0.2,to=2.0, by=0.1) # smoothing bandwidths we are using CV_err_h = rep(NA,length(h_seq)) for(j in 1:length(h_seq)){ h_using = h_seq[j] CV_err = rep(NA, n) for(i in 1:n){ X_val = X[i] Y_val = Y[i] # validation set X_tr = X[-i] Y_tr = Y[-i] # training set Y_val_predict = ksmooth(x=X_tr,y=Y_tr,kernel = "normal",bandwidth=h_using, x.points = X_val) CV_err[i] = (Y_val - Y_val_predict$y)^2 # we measure the error in terms of difference square } CV_err_h[j] = mean(CV_err) } CV_err_h plot(x=h_seq, y=CV_err_h, type="b", lwd=3, col="blue", xlab="Smoothing bandwidth", ylab="LOOCV prediction error") h_seq[which(CV_err_h == min(CV_err_h))] # Choice of bandwidth using k fold cross validation N_cv = 100 k = 20 cv_lab = sample(n,n,replace=F) %% k ## randomly split all the indices into k numbers h_seq = seq(from=0.2,to=2.0, by=0.1) CV_err_h = rep(0,length(h_seq)) for(i_tmp in 1:N_cv){ CV_err_h_tmp = rep(0, length(h_seq)) cv_lab = sample(n,n,replace=F) %% k for(i in 1:length(h_seq)){ h0 = h_seq[i] CV_err =0 for(i_cv in 1:k){ w_val = which(cv_lab==(i_cv-1)) X_tr = X[-w_val] Y_tr = Y[-w_val] X_val = X[w_val] Y_val = Y[w_val] kernel_reg = ksmooth(x = X_tr,y=Y_tr,kernel = "normal",bandwidth=h0, x.points=X_val) # WARNING! The ksmooth() function will order the x.points from # the smallest to the largest! CV_err = CV_err+mean((Y_val[order(X_val)]-kernel_reg$y)^2,na.rm=T) # na.rm = T: remove the case of 'NA' } CV_err_h_tmp[i] = CV_err/k } CV_err_h = CV_err_h+CV_err_h_tmp } CV_err_h = CV_err_h/N_cv plot(h_seq,CV_err_h, type="b", lwd=4, col="blue", xlab="Smoothing Bandwidth", ylab="5-CV Error") h_opt = h_seq[which(CV_err_h==min(CV_err_h))] h_opt