diff --git a/NAMESPACE b/NAMESPACE index e6ecdc5..d1550e3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -32,8 +32,6 @@ import(graphics) import(iterators) import(lifecycle) importFrom(dplyr,if_else) -importFrom(dplyr,mutate) -importFrom(dplyr,rename) importFrom(dplyr,select) importFrom(graphics,barplot) importFrom(lifecycle,deprecate_soft) diff --git a/NEWS b/NEWS index 9e8e044..7e57755 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,9 @@ ## Version 2.0 (gordillo) +* 11.10.2020 +New vignette! + * 02.07.2020 During the recent lockdown we had the chance to inevest a enough time on the development of a new version of the package resemble. This new version comes diff --git a/R/RcppExports.R b/R/RcppExports.R index 3922fa8..fb3ff76 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -59,7 +59,6 @@ moving_cor_diss <- function(X, Y, w) { #' @usage get_col_largest_sd(X) #' @param X a matrix. #' @return a value indicating the index of the column with the largest standard deviation. -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -72,7 +71,6 @@ get_col_largest_sd <- function(X) { #' @usage get_column_sds(X) #' @param X a a matrix. #' @return a vector of standard deviation values. -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -85,7 +83,6 @@ get_column_sds <- function(X) { #' @usage get_column_means(X) #' @param X a a matrix. #' @return a vector of mean values. -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -98,7 +95,6 @@ get_column_means <- function(X) { #' @usage get_column_sums(X) #' @param X a matrix. #' @return a vector of standard deviation values. -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -114,8 +110,8 @@ get_column_sums <- function(X) { #' @usage #' opls_for_projection(X, Y, ncomp, scale, #' maxiter, tol, -#' pcSelmethod = "cumvar", -#' pcSelvalue = 0.99) +#' pcSelmethod = "var", +#' pcSelvalue = 0.01) #' @param X a matrix of predictor variables. #' @param Y a matrix of either a single or multiple response variables. #' @param ncomp the number of pls components. @@ -156,11 +152,10 @@ get_column_sums <- function(X) { #' and \code{Xscale}}. #' \item{\code{weights}}{ the matrix of wheights.} #' } -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble -opls_for_projection <- function(X, Y, ncomp, scale, maxiter, tol, pcSelmethod = "cumvar", pcSelvalue = 0.99) { +opls_for_projection <- function(X, Y, ncomp, scale, maxiter, tol, pcSelmethod = "var", pcSelvalue = 0.01) { .Call('_resemble_opls_for_projection', PACKAGE = 'resemble', X, Y, ncomp, scale, maxiter, tol, pcSelmethod, pcSelvalue) } @@ -197,7 +192,6 @@ opls_for_projection <- function(X, Y, ncomp, scale, maxiter, tol, pcSelmethod = #' These objects contain information on the explained variance for the \code{X} and \code{Y} matrices respectively.} #' \item{\code{transf}}{ a \code{list} conating two objects: \code{Xcenter} and \code{Xscale}}. #' \item{\code{weights}}{ the matrix of wheights.}} -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -234,7 +228,6 @@ opls_get_all <- function(X, Y, ncomp, scale, maxiter, tol) { #' \item{\code{Y}}{ the \code{Y} input.} #' \item{\code{transf}}{ a \code{list} conating two objects: \code{Xcenter} and \code{Xscale}}. #' \item{\code{weights}}{ the matrix of wheights.}} -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -265,7 +258,6 @@ opls <- function(X, Y, ncomp, scale, maxiter, tol) { #' \item{\code{projection_mat}}{ the projection matrix.} #' \item{\code{transf}}{ a \code{list} conating two objects: \code{Xcenter} and \code{Xscale}}. #' } -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -284,7 +276,6 @@ opls_get_basics <- function(X, Y, ncomp, scale, maxiter, tol) { #' @param scale a logical indicating whether the matrix of predictors used to create the regression model was scaled. #' @param Xscale if \code{scale = TRUE} a matrix of one row with the values that must be used for scaling \code{newdata}. #' @return a matrix of predicted values. -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -303,7 +294,6 @@ predict_opls <- function(bo, b, ncomp, newdata, scale, Xscale) { #' @param Xscale if \code{scale = TRUE} a matrix of one row with the values that must be used for scaling \code{newdata}. #' @param Xcenter a matrix of one row with the values that must be used for centering \code{newdata}. #' @return a matrix corresponding to the new spectra projected onto the PLS space -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -318,7 +308,6 @@ project_opls <- function(projection_mat, ncomp, newdata, scale, Xcenter, Xscale) #' @param projection_mat the projection matrix generated by the \code{opls_get_basics} function. #' @param xloadings the loadings matrix generated by the \code{opls_get_basics} function. #' @return a matrix of 1 row and 1 column. -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -349,7 +338,6 @@ reconstruction_error <- function(x, projection_mat, xloadings) { #' @param Xcenter a matrix of one row with the values that must be used for centering \code{newdata}. #' @param Xscale if \code{scale = TRUE} a matrix of one row with the values that must be used for scaling \code{newdata}. #' @return a matrix of one row with the weights for each component between the max. and min. specified. -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -387,7 +375,6 @@ get_pls_weights <- function(projection_mat, xloadings, coefficients, new_x, min_ #' \item{\code{st_rmse_seg}}{ the standardized RMSEs.} #' \item{\code{rsq_seg}}{ the coefficients of determination.} #' } -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -413,7 +400,6 @@ opls_cv_cpp <- function(X, Y, scale, method, mindices, pindices, min_component, #' \item{\code{Ycenter}}{ if matrix of predictors was scaled, the centering vector used for \code{Y}.} #' \item{\code{Yscale}}{ if matrix of predictors was scaled, the scaling vector used for \code{Y}.} #' } -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -434,7 +420,6 @@ gaussian_process <- function(X, Y, noisev = 0.001, scale = TRUE) { #' @param Ycenter if \code{center = TRUE} a matrix of one row with the values that must be used for accounting for the centering of the response variable. #' @param Yscale if \code{scale = TRUE} a matrix of one row with the values that must be used for accounting for the scaling of the response variable. #' @return a matrix of predicted values -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -463,7 +448,6 @@ predict_gaussian_process <- function(Xz, alpha, newdata, scale, Xcenter, Xscale, #' \item{\code{st.rmse.seg}}{ the standardized RMSEs.} #' \item{\code{rsq.seg}}{ the coefficients of determination.} #' } -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble @@ -477,8 +461,8 @@ gaussian_process_cv <- function(X, Y, mindices, pindices, noisev = 0.001, scale #' @usage #' pca_nipals(X, ncomp, center, scale, #' maxiter, tol, -#' pcSelmethod = "cumvar", -#' pcSelvalue = 0.99) +#' pcSelmethod = "var", +#' pcSelvalue = 0.01) #' @param X a matrix of predictor variables. #' @param Y a matrix of either a single or multiple response variables. #' @param ncomp the number of pls components. @@ -488,13 +472,13 @@ gaussian_process_cv <- function(X, Y, mindices, pindices, noisev = 0.001, scale #' @param pcSelmethod the method for selecting the number of components. #' Options are: \code{'cumvar'} (for selecting the number of principal components based on a given #' cumulative amount of explained variance) and \code{"var"} (for selecting the number of principal -#' components based on a given amount of explained variance). Default is \code{'cumvar'} +#' components based on a given amount of explained variance). Default is \code{'var'} #' @param pcSelvalue a numerical value that complements the selected method (\code{pcSelmethod}). #' If \code{"cumvar"} is chosen, it must be a value (larger than 0 and below 1) indicating the maximum #' amount of cumulative variance that the retained components should explain. If \code{"var"} is chosen, #' it must be a value (larger than 0 and below 1) indicating that components that explain (individually) #' a variance lower than this threshold must be excluded. If \code{"manual"} is chosen, it must be a value -#' specifying the desired number of principal components to retain. Default is 0.99. +#' specifying the desired number of principal components to retain. Default is 0.01. #' @return a list containing the following elements: #' \itemize{ #' \item{\code{pc_scores}}{ a matrix of principal component scores.} @@ -502,11 +486,10 @@ gaussian_process_cv <- function(X, Y, mindices, pindices, noisev = 0.001, scale #' \item{\code{variance}}{ a matrix of the variance of the principal components.} #' \item{\code{scale}}{ a \code{list} conating two objects: \code{center} and \code{scale}, which correspond to the vectors used to center and scale the input matrix.} #' } -#' @useDynLib resemble #' @author Leonardo Ramirez-Lopez #' @keywords internal #' @useDynLib resemble -pca_nipals <- function(X, ncomp, center, scale, maxiter, tol, pcSelmethod = "cumvar", pcSelvalue = 0.99) { +pca_nipals <- function(X, ncomp, center, scale, maxiter, tol, pcSelmethod = "var", pcSelvalue = 0.01) { .Call('_resemble_pca_nipals', PACKAGE = 'resemble', X, ncomp, center, scale, maxiter, tol, pcSelmethod, pcSelvalue) } @@ -524,6 +507,20 @@ which_min <- function(X) { .Call('_resemble_which_min', PACKAGE = 'resemble', X) } +#' @title A function to compute indices of minimum values of a distance vector +#' @description For internal use only +#' @usage +#' which_min_vector(X) +#' @param X a vector of distances +#' @return a vector of the indices of the nearest neighbors +#' @details +#' Used internally to find the nearest neighbors. +#' It searches in lower (or upper) triangular matrix. Therefore this must be the format of the +#' input data. The piece of code int \code{len = (sqrt(X.size()*8+1)+1)/2} generated an error in CRAN +#' since \code{sqrt} cannot be applied to integers. +#' @keywords internal +#' @useDynLib resemble +#' @author Antoine Stevens which_min_vector <- function(X) { .Call('_resemble_which_min_vector', PACKAGE = 'resemble', X) } diff --git a/R/cor_diss.R b/R/cor_diss.R index eb4f406..214ecf8 100644 --- a/R/cor_diss.R +++ b/R/cor_diss.R @@ -21,17 +21,19 @@ #' must be scaled. If \code{Xu} is provided the data is scaled on the basis #' of \mjeqn{Xr \cup Xu}{Xr U Xu}. #' @details -#' The correlation dissimilarity \mjeqn{cd}{cd} between two observations -#' \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} is computed as follows: +#' The correlation dissimilarity \mjeqn{d}{d} between two observations +#' \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} is based on the Perason's +#' correlation coefficient (\mjeqn{\rho}{\rho}) and it can be computed as +#' follows: #' -#' \mjdeqn{cd(x_i, x_j) = \frac{1}{2}(1 - cor(x_i, x_j))}{cd(x_i, x_j) = 1/2 (1 - cor (x_i, x_j))} +#' \mjdeqn{d(x_i, x_j) = \frac{1}{2}(1 - \rho(x_i, x_j))}{d(x_i, x_j) = 1/2 (1 - \rho(x_i, x_j))} #' -#' The avobe formlula is used when \code{ws = NULL}. +#' The above formula is used when \code{ws = NULL}. #' On the other hand (when \code{ws != NULL}) the moving correlation -#' dissimilarity \mjeqn{mcd}{mcd} between two observations \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} +#' dissimilarity between two observations \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} #' is computed as follows: #' -#' \mjdeqn{mcd(x_i, x_j) = \frac{1}{2 ws}\sum_{k=1}^{p-ws}(1 - cor(x_{i,(k:k+ws)}, x_{j,(k:k+ws)}))}{mcd(x_i, x_j) = 1/(2 ws)\sum_(k=1)^{p-ws}(1 - cor(x_(i,k:k+ws), x_(j,k:k+ws)))} +#' \mjdeqn{d(x_i, x_j; ws) = \frac{1}{2 ws}\sum_{k=1}^{p-ws}1 - \rho(x_{i,(k:k+ws)}, x_{j,(k:k+ws)})}{d(x_i, x_j) = 1/(2 ws)\sum_(k=1)^{p-ws}(1 - \rho(x_(i,k:k+ws), x_(j,k:k+ws)))} #' #' where \mjeqn{ws}{ws} represents a given window size which rolls sequentially #' from 1 up to \mjeqn{p - ws}{p - ws} and \mjeqn{p}{p} is the number of @@ -40,7 +42,7 @@ #' The function does not accept input data containing missing values. #' @return #' a matrix of the computed dissimilarities. -#' @author Antoine Stevens and Leonardo Ramirez-Lopez +#' @author Antoine Stevens and \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} #' @examples #' \dontrun{ #' library(prospectr) diff --git a/R/f_diss.R b/R/f_diss.R index f5c63a2..7ff80dd 100644 --- a/R/f_diss.R +++ b/R/f_diss.R @@ -24,24 +24,22 @@ #' must be scaled. If \code{Xu} is provided the data is scaled on the basis #' of \mjeqn{Xr \cup Xu}{Xr U Xu}. #' @details -#' -#' the base [`dist`][base::dist()] -#' +#' #' The results obtained for Euclidean dissimilarity are equivalent to those -#' returned by the base [`dist`][base::dist()] function, but are scaled +#' returned by the [stats::dist()] function, but are scaled #' differently. However, \code{f_diss} is considerably faster (which can be #' advantageous when computing dissimilarities for very large matrices). The #' final scaling of the dissimilarity scores in \code{f_diss} where #' the number of variables is used to scale the squared dissimilarity scores. See -#' the examples section for a comparison between [`dist`][base::dist()] and +#' the examples section for a comparison between [stats::dist()] and #' \code{f_diss}. #' #' In the case of both the Euclidean and Mahalanobis distances, the scaled #' dissimilarity matrix \mjeqn{D}{D} between between observations in a given #' matrix \mjeqn{X}{X} is computed as follows: #' -#' \mjdeqn{D(x_i, x_j)^{2} = (x_i - x_j)M^{-1}(x_i - x_j)^{\mathrm{T}}}{D(x_i, x_j)^{2} = (x_i - x_j)M^{-1}(x_i - x_j)^T} -#' \mjdeqn{D_{scaled}(x_i, x_j) = \sqrt{\frac{1}{p}D(x_i, x_j)^{2}}}{D_scaled (x_i, x_j) = sqrt(1/p D(x_i, x_j)^2)} +#' \mjdeqn{d(x_i, x_j)^{2} = \sum (x_i - x_j)M^{-1}(x_i - x_j)^{\mathrm{T}}}{D(x_i, x_j)^{2} = \sum (x_i - x_j)M^{-1}(x_i - x_j)^T} +#' \mjdeqn{d_{scaled}(x_i, x_j) = \sqrt{\frac{1}{p}d(x_i, x_j)^{2}}}{d_scaled (x_i, x_j) = sqrt(1/p d(x_i, x_j)^2)} #' #' where \mjeqn{p}{p} is the number of variables in \mjeqn{X}{X}, \mjeqn{M}{M} is the identity #' matrix in the case of the Euclidean distance and the variance-covariance @@ -62,10 +60,10 @@ #' For the computation of the Mahalanobis distance, the mentioned method is #' used. #' -#' The cosine dissimilarity \mjeqn{S}{S} between two observations +#' The cosine dissimilarity \mjeqn{c}{c} between two observations #' \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} is computed as follows: #' -#' \mjdeqn{S(x_i, x_j) = cos^{-1}{\frac{\sum_{k=1}^{p}x_{i,k} x_{j,k}}{\sqrt{\sum_{k=1}^{p} x_{i,k}^{2}} \sqrt{\sum_{k=1}^{p} x_{j,k}^{2}}}}}{S(x_i, x_j) = cos^{-1} ((sum_(k=1)^p x_(i,k) x_(j,k))/(sum_(k=1)^p x_(i,k) sum_(k=1)^p x_(j,k)))} +#' \mjdeqn{c(x_i, x_j) = cos^{-1}{\frac{\sum_{k=1}^{p}x_{i,k} x_{j,k}}{\sqrt{\sum_{k=1}^{p} x_{i,k}^{2}} \sqrt{\sum_{k=1}^{p} x_{j,k}^{2}}}}}{c(x_i, x_j) = cos^{-1} ((sum_(k=1)^p x_(i,k) x_(j,k))/(sum_(k=1)^p x_(i,k) sum_(k=1)^p x_(j,k)))} #' #' where \mjeqn{p}{p} is the number of variables of the observations. #' The function does not accept input data containing missing values. @@ -74,7 +72,7 @@ #' #' @return #' a matrix of the computed dissimilarities. -#' @author Leonardo Ramirez-Lopez and Antoine Stevens +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens #' @examples #' \dontrun{ #' library(prospectr) @@ -163,37 +161,37 @@ f_diss <- function(Xr, Xu = NULL, diss_method = "euclid", if (sum(is.na(Xr)) > 0) { stop("Matrices with missing values are not accepted") } - + n_method <- diss_method if (!n_method %in% c("euclid", "mahalanobis", "cosine")) { stop("'diss_method' must be one of: 'euclid', 'mahalanobis' or'cosine'") - + if (length(n_method) > 1) { } else { n_method <- diss_method[[1]] message(paste("More than one diss_method was specified, only", n_method, "was used.")) } } - + if (!is.logical(center)) { stop("'center' must be logical") } - + if (!is.logical(scale)) { stop("'scale' must be logical") } - + if (center | scale | n_method %in% c("mahalanobis", "euclid")) { X <- rbind(Xr, Xu) - + if (center) { X <- sweep(x = X, MARGIN = 2, FUN = "-", STATS = colMeans(X)) } - + if (scale) { X <- sweep(x = X, MARGIN = 2, FUN = "/", STATS = get_col_sds(X)) } - + if (n_method == "mahalanobis") { if (nrow(X) < ncol(X)) { stop("For computing the Mahalanobis distance, the total number of observations (rows) \n must be larger than the number of variables (columns).") @@ -204,7 +202,7 @@ f_diss <- function(Xr, Xu = NULL, diss_method = "euclid", } n_method <- "euclid" } - + if (!is.null(Xu)) { Xu <- X[(nrow(X) - nrow(Xu) + 1):nrow(X), , drop = FALSE] Xr <- X[1:(nrow(X) - nrow(Xu)), , drop = FALSE] @@ -213,7 +211,7 @@ f_diss <- function(Xr, Xu = NULL, diss_method = "euclid", } rm(X) } - + if (!is.null(Xu)) { ## FIXME check numerical precision in Rcpp ## in some cases it returns 0s as -1e-14 @@ -238,7 +236,7 @@ f_diss <- function(Xr, Xu = NULL, diss_method = "euclid", if (diss_method == "cosine") { rslt[is.nan(rslt)] <- 0 } - + rslt } @@ -250,25 +248,25 @@ f_diss <- function(Xr, Xu = NULL, diss_method = "euclid", #' @importFrom stats cov euclid_to_mahal <- function(X, sm_method = c("svd", "eigen")) { nms <- dimnames(X) - + if (ncol(X) > nrow(X)) { stop("In order to project the matrix to a Mahalanobis space, the number of observations of the input matrix must larger than its number of variables") } - + if (length(sm_method) > 1) { sm_method <- sm_method[1] } if (!(sm_method %in% c("svd", "eigen"))) { stop("sm_method must be one of 'svd', 'eigen'") } - + X <- as.matrix(X) vcv <- cov(X) sq_vcv <- sqrt_sm(vcv, method = sm_method) sq_S <- solve(sq_vcv) ms_x <- X %*% sq_S dimnames(ms_x) <- nms - + ms_x } @@ -286,7 +284,7 @@ sqrt_sm <- function(X, method = c("svd", "eigen")) { if (!(method %in% c("svd", "eigen"))) { stop("method must be one of 'svd', 'eigen'") } - + if (method == "svd") { ## REPLACE BY arma::svd(U, S, V, X, "dc") out <- svd(X) @@ -294,7 +292,7 @@ sqrt_sm <- function(X, method = c("svd", "eigen")) { U <- out$v return(U %*% (D^0.5) %*% t(U)) } - + if (method == "eigen") { out <- eigen(X) D <- diag(out$values) diff --git a/R/get_predictions.R b/R/get_predictions.R index 032063a..489b2ff 100644 --- a/R/get_predictions.R +++ b/R/get_predictions.R @@ -8,7 +8,7 @@ #' get_predictions(object) #' @param object an object of class \code{mbl} as returned by \code{mbl} #' @return a data.table of predicted values according to either \code{k} or \code{k_dist} -#' @author Leonardo Ramirez-Lopez and Antoine Stevens +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens #' @seealso \code{\link{mbl}} #' @export ###################################################################### diff --git a/R/local_fit.R b/R/local_fit.R index 6c0fb9b..a293523 100644 --- a/R/local_fit.R +++ b/R/local_fit.R @@ -81,7 +81,7 @@ #' } #' } #' @return An object of class \code{local_fit} mirroring the input arguments. -#' @author Leonardo Ramirez-Lopez +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} #' @references #' Shenk, J., Westerhaus, M., and Berzaghi, P. 1997. Investigation of a LOCAL #' calibration procedure for near infrared instruments. Journal of Near Infrared diff --git a/R/local_helpers.R b/R/local_helpers.R index 55d88f2..ed2c6ee 100644 --- a/R/local_helpers.R +++ b/R/local_helpers.R @@ -46,7 +46,7 @@ get_neighbor_info <- function(Xr, Xu, diss_method, Yr = NULL, k_diss_max <- k_range <- k_diss <- NULL } - if_else(is_local, return_diss <- FALSE, return_diss <- return_dissimilarity) + ifelse(is_local, return_diss <- FALSE, return_diss <- return_dissimilarity) # use do.call to be able to change local to FALSE in ... (if provided) info_neighbors <- do.call( search_neighbors, @@ -132,174 +132,6 @@ get_neighbor_info <- function(Xr, Xu, diss_method, Yr = NULL, info_neighbors } -#' @title A function to create the sample sets for calibration/validation in -#' cross-validation. -#' @description for internal use only! This is stratified sampling based on the -#' values of a continous response variable (y). If group is provided, the -#' sampling is done based on the groups and the average of y per group. This -#' function is used to create groups forleave-group-out cross-validations (or -#' leave-group-of-groups-out cross-validation if group argument is provided). -#' @param y a matrix of one column with the response variable. -#' @param p the percentage of samples (or groups if group argument is used) to -#' retain in the hold_in set -#' @param number the number of sample groups to be crated -#' @param group the labels for each sample in \code{y} indicating the group each -#' observation belongs to. -#' @return a list with two matrices (\code{hold_in} and \code{hold_out}) giving -#' the indices of the observations in each column. The number of colums represents -#' the number of sampling repetitions. -#' @keywords internal -sample_str <- function(y, p, number, group = NULL) { - if (is.null(group)) { - nv <- floor((1 - p) * nrow(y)) - nv <- ifelse(nv < 1, 1, nv) - y_strata <- unique(quantile(y, - probs = seq(0, 1, length = (nv + 1)), - names = FALSE - )) - y_cuts <- cut(y, - breaks = y_strata, - labels = 1:(length(y_strata) - 1), - include.lowest = TRUE - ) - strata_category <- data.table( - original_order = 1:length(y), - strata = y_cuts - ) - - hold_out <- matrix(NA, nv, number) - hold_in <- matrix(NA, nrow(y) - nv, number) - colnames(hold_out) <- colnames(hold_in) <- paste0( - "Resample_", - seq(1:number) - ) - rownames(hold_out) <- paste0("index_", seq(1:nv)) - rownames(hold_in) <- paste0("index_", seq(1:nrow(hold_in))) - - indcs <- 1:nrow(y) - - for (jj in 1:number) { - strata_samples <- sort(tapply( - X = strata_category$original_order, - FUN = function(x) { - if (length(x) == 1) { - # this is required to keep the name of the - # strata, otherwise it fails - x <- c(x, x) - } - sample(x, size = 1) - }, - INDEX = strata_category$strata - )) - if (length(strata_samples) < nv) { - adds <- sample( - (1:length(y))[-strata_samples], - nv - length(strata_samples) - ) - strata_samples <- c(strata_samples, adds) - } - hold_out[, jj] <- strata_samples - hold_in[, jj] <- indcs[!indcs %in% strata_samples] - } - } else { - y_groups <- data.table( - y = y, - group = factor(group), - original_order = 1:length(y) - ) - gr_levels <- levels(y_groups$group) - n_levels <- nlevels(y_groups$group) - # Compute means per group and make the stratified sampling based on that - # vector of means - aggregated_y <- tapply( - X = y_groups$y, - INDEX = y_groups$group, - FUN = mean - ) - - nv <- floor((1 - p) * n_levels) - nv <- ifelse(nv < 1, 1, nv) - y_strata <- quantile(aggregated_y, - probs = seq(0, 1, length = (nv + 1)), - names = FALSE - ) - y_cuts <- cut(aggregated_y, - breaks = unique(y_strata), - labels = paste0("l_", 1:(length(y_strata) - 1)), - include.lowest = TRUE - ) - strata_category <- data.table( - original_order = 1:length(aggregated_y), - strata = factor(y_cuts) - ) - - hold_out <- matrix(0, nv, number) - colnames(hold_out) <- paste0("Resample_", seq(1:number)) - rownames(hold_out) <- paste0("index_", seq(1:nv)) - - list_hold_out <- list_hold_in <- NULL - for (jj in 1:number) { - strata_samples <- tapply(strata_category$original_order, - FUN = function(x) { - if (length(x) == 1) { - # this is required to keep the name of the - # strata, otherwise it fails - x <- c(x, x) - } - sample(x, size = 1) - }, - INDEX = strata_category$strata - ) - - if (length(strata_samples) < nv) { - adds <- sample( - (1:nrow(strata_category))[-strata_samples], - nv - length(strata_samples) - ) - strata_samples <- c(strata_samples, adds) - } - - strata_samples <- gr_levels[strata_samples] - - list_hold_out[[jj]] <- y_groups$original_order[y_groups$group %in% strata_samples] - list_hold_in[[jj]] <- y_groups$original_order[!y_groups$group %in% strata_samples] - } - - lengths_list_hold_out <- lengths(list_hold_out) - lengths_list_hold_in <- lengths(list_hold_in) - hold_out <- matrix(NA, max(lengths_list_hold_out), number) - hold_in <- matrix(NA, max(lengths_list_hold_in), number) - colnames(hold_out) <- colnames(hold_in) <- paste0( - "Resample_", - seq(1:number) - ) - rownames(hold_out) <- paste0("index_", 1:nrow(hold_out)) - rownames(hold_in) <- paste0("index_", 1:nrow(hold_in)) - indcs <- 1:nrow(y) - for (jj in 1:number) { - ## ramdomly select the replacements for the missing indexes - ## replacements are necessary! - jj_add_list_hold_out <- sample(list_hold_out[[jj]], - size = max(lengths_list_hold_out) - lengths_list_hold_out[jj], - replace = TRUE - ) - jj_add_list_hold_in <- sample(list_hold_in[[jj]], - size = max(lengths_list_hold_in) - lengths_list_hold_in[jj], - replace = TRUE - ) - - hold_out[1:lengths_list_hold_out[jj], jj] <- list_hold_out[[jj]] - hold_in[1:lengths_list_hold_in[jj], jj] <- list_hold_in[[jj]] - hold_out[-(1:lengths_list_hold_out[jj]), jj] <- jj_add_list_hold_out - hold_in[-(1:lengths_list_hold_in[jj]), jj] <- jj_add_list_hold_in - } - } - list( - hold_out = hold_out, - hold_in = hold_in - ) -} - #' @title Cross validation for PLS regression #' @description for internal use only! #' @keywords internal @@ -320,11 +152,12 @@ pls_cv <- function(x, y, ncomp, if (min_allowed < ncomp) { ncomp <- min_allowed } - cv_samples <- sample_str( + cv_samples <- sample_stratified( y = y, p = p, number = number, - group = group + group = group, + replacement = FALSE ) if (method == "wapls" & retrieve & tune) { @@ -515,7 +348,7 @@ get_wapls_weights <- function(pls_model, original_x, type = "w1", new_x = NULL, new_x = new_x, min_component = min_component, max_component = max_component, - scale = if_else(nrow(pls_model$transf$Xscale) == 1, TRUE, FALSE), + scale = ifelse(nrow(pls_model$transf$Xscale) == 1, TRUE, FALSE), Xcenter = pls_model$transf$Xcenter, Xscale = pls_model$transf$Xscale )[1, ] @@ -982,11 +815,12 @@ gaussian_pr_cv <- function(x, retrieve = c("final_model", "none")) { ## Create the resampling groups - cv_samples <- sample_str( + cv_samples <- sample_stratified( y = y, p = p, number = number, - group = group + group = group, + replacement = FALSE ) cv_results <- gaussian_process_cv( diff --git a/R/mbl.R b/R/mbl.R index 5d95d4f..2c4e5c4 100644 --- a/R/mbl.R +++ b/R/mbl.R @@ -123,7 +123,7 @@ #' components) and \code{value} (a numerical value that complements the selected #' method). The methods available are: #' \itemize{ -#' \item{\code{"opc"}:} {optimized principal component selection based on +#' \item{\code{"opc"}:} { optimized principal component selection based on #' Ramirez-Lopez et al. (2013a, 2013b). The optimal number of components #' (of set of observations) is the one for which its distance matrix #' minimizes the differences between the \code{Yr} value of each @@ -133,17 +133,17 @@ #' number of principal components to be tested. See the #' \code{\link{ortho_projection}} function for more details.} #' -#' \item{\code{"cumvar"}:}{selection of the principal components based +#' \item{\code{"cumvar"}:}{ selection of the principal components based #' on a given cumulative amount of explained variance. In this case, #' \code{value} must be a value (larger than 0 and below or equal to 1) -#' indicating the maximum amount of cumulative variance that the -#' retained components should explain.} +#' indicating the minimum amount of cumulative variance that the +#' combination of retained components should explain.} #' -#' \item{\code{"var"}:}{selection of the principal components based +#' \item{\code{"var"}:}{ selection of the principal components based #' on a given amount of explained variance. In this case, #' \code{value} must be a value (larger than 0 and below or equal to 1) -#' indicating the minimum amount of variance that a component should -#' explain in order to be retained.} +#' indicating the minimum amount of variance that a single component +#' should explain in order to be retained.} #' #' \item{\code{"manual"}:}{ for manually specifying a fix number of #' principal components. In this case, \code{value} must be a value @@ -151,12 +151,12 @@ #' indicating the minimum amount of variance that a component should #' explain in order to be retained.} #' } -#' The default list passed is \code{list(method = "cumvar", value = 0.99)}. +#' The default list passed is \code{list(method = "opc", value = min(dim(Xr), 40))}. #' Optionally, the \code{pc_selection} argument admits \code{"opc"} or #' \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character #' string. In such a case the default \code{"value"} when either \code{"opc"} or #' \code{"manual"} are used is 40. When \code{"cumvar"} is used the default -#' \code{"value"} is set to 0.99 and when \code{"var"} is used the default +#' \code{"value"} is set to 0.99 and when \code{"var"} is used, the default #' \code{"value"} is set to 0.01. #' @param group an optional factor (or character vector vector #' that can be coerced to \code{\link[base]{factor}} by \code{as.factor}) that @@ -343,7 +343,7 @@ #' observations for which the neighbors selected by the given dissimilarity #' threshold were outside the boundaries specified in the \code{k_range} #' argument. -#' @author Leonardo Ramirez-Lopez and Antoine Stevens +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens #' @references #' Cleveland, W. S., and Devlin, S. J. 1988. Locally weighted regression: an #' approach to regression analysis by local fitting. Journal of the American @@ -379,27 +379,26 @@ #' \dontrun{ #' library(prospectr) #' data(NIRsoil) -#' -#' # Filter the data using the Savitzky and Golay smoothing filter with -#' # a window size of 11 spectral variables and a polynomial order of 3 -#' # (no differentiation). -#' sg <- savitzkyGolay(NIRsoil$spc, p = 3, w = 11, m = 0) -#' -#' # Replace the original spectra with the filtered ones -#' NIRsoil$spc <- sg -#' -#' Xu <- NIRsoil$spc[!as.logical(NIRsoil$train), ] -#' Yu <- NIRsoil$CEC[!as.logical(NIRsoil$train)] -#' -#' Yr <- NIRsoil$CEC[as.logical(NIRsoil$train)] -#' Xr <- NIRsoil$spc[as.logical(NIRsoil$train), ] -#' -#' Xu <- Xu[!is.na(Yu), ] -#' Xr <- Xr[!is.na(Yr), ] -#' -#' Yu <- Yu[!is.na(Yu)] -#' Yr <- Yr[!is.na(Yr)] -#' +#' +#' # Proprocess the data using detrend plus first derivative with Savitzky and +#' # Golay smoothing filter +#' sg_det <- savitzkyGolay( +#' detrend(NIRsoil$spc, +#' wav = as.numeric(colnames(NIRsoil$spc))), +#' m = 1, +#' p = 1, +#' w = 7 +#' ) +#' +#' NIRsoil$spc_pr <- sg_det +#' +#' # split into training and testing sets +#' test_x <- NIRsoil$spc_pr[NIRsoil$train == 0 & !is.na(NIRsoil$CEC),] +#' test_y <- NIRsoil$CEC[NIRsoil$train == 0 & !is.na(NIRsoil$CEC)] +#' +#' train_y <- NIRsoil$CEC[NIRsoil$train == 1 & !is.na(NIRsoil$CEC)] +#' train_x <- NIRsoil$spc_pr[NIRsoil$train == 1 & !is.na(NIRsoil$CEC),] +#' #' # Example 1 #' # A mbl implemented in Ramirez-Lopez et al. (2013, #' # the spectrum-based learner) @@ -407,103 +406,118 @@ #' # An exmaple where Yu is supposed to be unknown, but the Xu #' # (spectral variables) are known #' my_control <- mbl_control(validation_type = "NNv") -#' +#' #' ## The neighborhood sizes to test #' ks <- seq(40, 140, by = 20) -#' +#' #' sbl <- mbl( -#' Xr = Xr, Yr = Yr, Xu = Xu, +#' Xr = train_x, +#' Yr = train_y, +#' Xu = test_x, #' k = ks, #' method = local_fit_gpr(), #' control = my_control, +#' scale = TRUE #' ) #' sbl #' plot(sbl) #' get_predictions(sbl) -#' +#' #' # Example 1.2 #' # If Yu is actually known... #' sbl_2 <- mbl( -#' Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, +#' Xr = train_x, +#' Yr = train_y, +#' Xu = test_x, +#' Yu = test_y, #' k = ks, #' method = local_fit_gpr(), -#' control = my_control, +#' control = my_control #' ) #' sbl_2 #' plot(sbl_2) -#' +#' #' # Example 2 #' # the LOCAL algorithm (Shenk et al., 1997) #' local_algorithm <- mbl( -#' Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, +#' Xr = train_x, +#' Yr = train_y, +#' Xu = test_x, +#' Yu = test_y, #' k = ks, #' method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), #' diss_method = "cor", #' diss_usage = "none", -#' control = my_control, +#' control = my_control #' ) #' local_algorithm #' plot(local_algorithm) -#' +#' #' # Example 3 #' # A variation of the LOCAL algorithm (using the optimized pc #' # dissmilarity matrix) and dissimilarity matrix as source of #' # additional preditors #' local_algorithm_2 <- mbl( -#' Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, +#' Xr = train_x, +#' Yr = train_y, +#' Xu = test_x, +#' Yu = test_y, #' k = ks, #' method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), #' diss_method = "pca", #' diss_usage = "predictors", -#' control = my_control, +#' control = my_control #' ) #' local_algorithm_2 -#' plot(local_algorithm) -#' +#' plot(local_algorithm_2) +#' #' # Example 4 #' # Running the mbl function in parallel with example 2 -#' library(future) -#' -#' n_cores <- availableCores() - 1 +#' n_cores <- parallel::detectCores() - 1 #' if (n_cores == 0) { #' n_cores <- 1 #' } +#' +#' library(doParallel) +#' clust <- makeCluster(n_cores) +#' registerDoParallel(clust) #' -#' # Set the number of cores according to the OS -#' if (.Platform$OS.type == "windows") { -#' library(doParallel) -#' clust <- makeCluster(n_cores) -#' registerDoParallel(clust) -#' } else { -#' library(doSNOW) -#' clust <- makeCluster(n_cores, type = "SOCK") -#' registerDoSNOW(clust) -#' getDoParWorkers() -#' } -#' +#' # Alernatively: +#' # library(doSNOW) +#' # clust <- makeCluster(n_cores, type = "SOCK") +#' # registerDoSNOW(clust) +#' # getDoParWorkers() +#' #' local_algorithm_par <- mbl( -#' Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, +#' Xr = train_x, +#' Yr = train_y, +#' Xu = test_x, +#' Yu = test_y, #' k = ks, #' method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), #' diss_method = "cor", #' diss_usage = "none", -#' control = my_control, +#' control = my_control #' ) #' local_algorithm_par -#' +#' #' registerDoSEQ() #' try(stopCluster(clust)) -#' +#' #' # Example 5 #' # Using local pls distances #' with_local_diss <- mbl( -#' Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, +#' Xr = train_x, +#' Yr = train_y, +#' Xu = test_x, +#' Yu = test_y, #' k = ks, #' method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), #' diss_method = "pls", #' diss_usage = "predictors", #' control = my_control, -#' .local = TRUE, pre_k = 150, +#' .local = TRUE, +#' pre_k = 150, #' ) #' with_local_diss #' plot(with_local_diss) @@ -633,28 +647,28 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, documentation = character(), ...) { f_call <- match.call() - + "%mydo%" <- get("%do%") if (control$allow_parallel & getDoParRegistered()) { "%mydo%" <- get("%dopar%") } - + if (missing(k)) { k <- NULL } - + if (missing(k_diss)) { k_diss <- NULL } - + if (missing(k_range)) { k_range <- NULL } - + input_dots <- list(...) ini_cntrl <- control ortho_diss_methods <- c("pca", "pca.nipals", "pls") - + if (".local" %in% names(input_dots)) { if (isTRUE(input_dots$.local)) { if (!"pre_k" %in% names(input_dots)) { @@ -667,60 +681,60 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } } } - + # Sanity checks if (!is.logical(center)) { stop("'center' argument must be logical") } - + if (!is.logical(scale)) { stop("'scale' argument must be logical") } - + if (ncol(Xr) != ncol(Xu)) { stop("The number of predictor variables in Xr must be equal to the number of variables in Xu") } - + if (ncol(Xr) < 4) { stop("This function works only with matrices containing more than 3 predictor variables") } - + if (length(Yr) != nrow(Xr)) { stop("length(Yr) must be equal to nrow(Xr)") } - + if (any(is.na(Yr))) { stop("The current version of the mbl function does not handle NAs in the response variable of the reference observations (Yr)") } - + Xr <- as.matrix(Xr) Xu <- as.matrix(Xu) Yr <- as.matrix(Yr) - + n_xr <- nrow(Xr) n_xu <- nrow(Xu) n_total <- n_xr + n_xu - + rownames(Xr) <- 1:nrow(Xr) rownames(Xu) <- 1:nrow(Xu) - + if (is.null(colnames(Xr))) { colnames(Xr) <- 1:ncol(Xr) } - + if (is.null(colnames(Xu))) { colnames(Xu) <- 1:ncol(Xu) } - + if (sum(!colnames(Xu) == colnames(Xr)) != 0) { stop("Variable names in Xr do not match those in Xu") } - + diss_methods <- c( "pca", "pca.nipals", "pls", "cor", "euclid", "cosine", "sid" ) - + if (!is.character(diss_method) & !is.matrix(diss_method)) { mtds <- paste(diss_methods, collapse = ", ") stop(paste0( @@ -729,7 +743,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, " or a matrix" )) } - + if (!is.null(group)) { if (length(group) != nrow(Xr)) { stop(paste0( @@ -738,16 +752,16 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, )) } } - + if (length(pc_selection) != 2 | !is.list(pc_selection)) { stop("'pc_selection' must be a list of length 2") } - + if (!all(names(pc_selection) %in% c("method", "value")) | is.null(names(pc_selection))) { names(pc_selection)[sapply(pc_selection, FUN = is.character)] <- "method" names(pc_selection)[sapply(pc_selection, FUN = is.numeric)] <- "value" } - + pc_sel_method <- match.arg(pc_selection$method, c( "opc", "var", @@ -755,7 +769,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, "manual" )) pc_threshold <- pc_selection$value - + if (pc_sel_method %in% c("opc", "manual") & pc_selection$value > min(n_total, ncol(Xr))) { warning(paste0( "When pc_selection$method is 'opc' or 'manual', the value ", @@ -767,14 +781,14 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, )) pc_threshold <- min(n_total, ncol(Xr)) } - - + + match.arg(diss_usage, c("predictors", "weights", "none")) - + if (is.null(k) & is.null(k_diss)) { stop("Either k or k_diss must be specified") } - + k_max <- NULL if (!is.null(k)) { if (!is.null(k_diss)) { @@ -788,7 +802,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, k <- sort(k) k_max <- max(k) } - + k_diss_max <- NULL if (!is.null(k_diss)) { k_diss <- unique(sort(k_diss)) @@ -809,7 +823,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } k_diss_max <- max(k_diss) } - + if (".local" %in% names(input_dots)) { if (isTRUE(input_dots$local)) { if (!"pre_k" %in% names(input_dots)) { @@ -826,33 +840,33 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } } } - + if (!"local_fit" %in% class(method)) { stop("Object passed to method must be of class local_fit") } - + validation_type <- control$validation_type is_local_cv <- "local_cv" %in% validation_type is_nnv_val <- "NNv" %in% validation_type - + if (all(c("local_cv", "NNv") %in% control$validation_type)) { validation_type <- "both" } - + if (validation_type %in% c("NNv", "both") & nrow(Xu) < 3) { stop(paste0( "For nearest neighbor validation (control$validation_type == 'NNv')", " Xu must contain at least 3 observations" )) } - + if (!is.null(Yu)) { Yu <- as.matrix(Yu) if (length(Yu) != nrow(Xu)) { stop("Number of observations in Yu and Xu differ") } } - + if (!is.null(k)) { k <- as.integer(k) if (min(k) < 4) { @@ -865,9 +879,9 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, )) } } - + has_projection <- FALSE - + if (!is.matrix(diss_method)) { # when .local = TRUE, k_max is replaced with k_pre inside get_neighbor_info() neighborhoods <- get_neighbor_info( @@ -889,7 +903,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } } else { diss_xr_xr <- NULL - + dim_diss <- dim(diss_method) if (diss_usage == "predictors") { if (diff(dim_diss) != 0 | dim_diss[1] != n_total | any(diag(diss_method) != 0)) { @@ -919,12 +933,12 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, append( neighborhoods, diss_to_neighbors(diss_xr_xu, - k = k, k_diss = k_diss, k_range = k_range, - spike = NULL, - return_dissimilarity = control$return_dissimilarity + k = k, k_diss = k_diss, k_range = k_range, + spike = NULL, + return_dissimilarity = control$return_dissimilarity ) ) - + if (gh) { neighborhoods <- NULL neighborhoods$gh$projection <- pls_projection( @@ -934,26 +948,26 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, scale = scale, ... ) neighborhoods$gh$gh_Xr <- f_diss(neighborhoods$gh$projection$scores, - Xu = t(colMeans(neighborhoods$gh$projection$scores)), - diss_method = "mahalanobis", - center = FALSE, scale = FALSE + Xu = t(colMeans(neighborhoods$gh$projection$scores)), + diss_method = "mahalanobis", + center = FALSE, scale = FALSE ) neighborhoods$gh$gh_Xu <- neighborhoods$gh$gh_Xr[-c(1:nrow(Xr))] neighborhoods$gh$gh_Xr <- neighborhoods$gh$gh_Xr[c(1:nrow(Xr))] neighborhoods$gh <- neighborhoods$gh[c("gh_Xr", "gh_Xu", "projection")] } - + neighborhoods$diss_xr_xr <- diss_xr_xr rm(diss_xr_xr) rm(diss_method) gc() } - + if (!is.null(k)) { smallest_neighborhood <- neighborhoods$neighbors[1:min(k), , drop = FALSE] smallest_n_neighbors <- colSums(!is.na(smallest_neighborhood)) } - + if (!is.null(k_diss)) { min_diss <- neighborhoods$neighbors_diss <= min(k_diss) if (!is.null(spike)) { @@ -965,8 +979,8 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, smallest_n_neighbors[smallest_n_neighbors < min(k_range)] <- min(k_range) smallest_n_neighbors[smallest_n_neighbors > max(k_range)] <- max(k_range) } - - + + if (is_local_cv) { min_n_samples <- floor(min(smallest_n_neighbors) * control$p) - 1 min_cv_samples <- floor(min(k, k_range) * (1 - control$p)) @@ -980,7 +994,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } else { min_n_samples <- smallest_n_neighbors - 1 } - + if (method$method %in% c("pls", "wapls")) { max_pls <- max(method$pls_c) if (any(min_n_samples < max_pls)) { @@ -992,7 +1006,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, )) } } - + if (!".local" %in% names(input_dots)) { iter_neighborhoods <- ith_mbl_neighbor( @@ -1012,7 +1026,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, group = group ) } - + r_fields <- c( "o_index", "k_diss", "k_original", "k", "npls", "min_pls", "max_pls", "yu_obs", "pred", "yr_min_obs", "yr_max_obs", @@ -1021,15 +1035,15 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, "y_farthest", "diss_nearest", "diss_farthest", "loc_rmse_cv", "loc_st_rmse_cv", "loc_n_components", "rep" ) - + n_ith_result <- ifelse(is.null(k_diss), length(k), length(k_diss)) - + template_pred_results <- data.table(matrix(NA, n_ith_result, length(r_fields), - dimnames = list(NULL, r_fields) + dimnames = list(NULL, r_fields) )) - + template_pred_results$rep[1] <- 0 - + if (!is.null(k_diss)) { template_pred_results$k_diss <- k_diss } else { @@ -1038,11 +1052,11 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, pg_bar_width <- 10 # to_erase <- getOption("width") - pg_bar_width - (2 * nchar(nrow(Xu))) - 2 to_erase <- pg_bar_width + (2 * nchar(nrow(Xu))) + 8 - to_erase <- paste(rep(" ", to_erase), collpase = "") - + to_erase <- paste(rep(" ", to_erase), collapse = "") + cat("\033[32m\033[3mPredicting...\n\033[23m\033[39m") n_iter <- nrow(Xu) - + pred_obs <- foreach( i = 1:n_iter, ith_observation = iter_neighborhoods, @@ -1073,23 +1087,22 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, ith_group = ith_observation$ith_group, ... ) - + ith_pred_results$loc_n_components[] <- ith_observation$ith_components additional_results$ith_neig_indices <- ith_observation$ith_neig_indices additional_results$ith_neigh_diss <- ith_observation$ith_neigh_diss } - + if (control$progress) { cat(paste0("\033[34m\033[3m", i, "/", n_iter, "\033[23m\033[39m")) pb <- txtProgressBar(width = pg_bar_width, char = "\033[34m_\033[39m") } - + if (!is.null(k_diss)) { ith_diss <- ith_observation$ith_neigh_diss if (!is.null(spike)) { ith_diss[1:length(spike)] <- 0 } - ith_pred_results$k_original <- sapply(k_diss, FUN = function(x, d) sum(d < x), d = ith_diss) ith_pred_results$k <- ith_pred_results$k_original ith_pred_results$k[ith_pred_results$k_original < min(k_range)] <- min(k_range) @@ -1097,12 +1110,12 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } else { ith_pred_results$k <- k } - + for (kk in 1:nrow(ith_pred_results)) { if (control$progress) { setTxtProgressBar(pb, kk / nrow(ith_pred_results)) } - + # If the sample has not been predicted before, # then create a model and predict it (useful only when k_diss is used) current_k <- ith_pred_results$k[kk] @@ -1131,13 +1144,13 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, ith_pred_results$y_nearest[kk] <- i_k_yr[which.min(kth_diss)] ith_pred_results$index_nearest_in_Xr[kk] <- ith_observation$ith_neig_indices[which.min(kth_diss)] ith_pred_results$index_farthest_in_Xr[kk] <- ith_observation$ith_neig_indices[which.max(kth_diss)] - + if (!is.null(group)) { i_k_group <- factor(ith_observation$ith_group[1:current_k]) } else { i_k_group <- NULL } - + if (diss_usage == "weights") { # Weights are defined according to a tricubic function # as in Cleveland and Devlin (1988) and Naes and Isaksson (1990). @@ -1147,7 +1160,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } else { kth_weights <- rep(1, current_k) } - + # local fit i_k_pred <- fit_and_predict( x = i_k_xr, @@ -1167,9 +1180,9 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, pls_max_iter = 1, pls_tol = 1e-6 ) - + ith_pred_results$pred[kk] <- i_k_pred$prediction - + selected_pls <- NULL if (is_local_cv) { if (control$tune_locally) { @@ -1177,7 +1190,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } else { best_row <- ifelse(method$method == "pls", method$pls_c, 1) } - + if (method$method == "pls") { ith_pred_results$npls[kk] <- i_k_pred$validation$cv_results$npls[best_row] selected_pls <- ith_pred_results$npls[kk] @@ -1187,7 +1200,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, ith_pred_results$max_pls[kk] <- i_k_pred$validation$cv_results$max_component[best_row] selected_pls <- i_k_pred$validation$cv_results[best_row, 1:2] } - + ith_pred_results$loc_rmse_cv[kk] <- i_k_pred$validation$cv_results$rmse_cv[best_row] ith_pred_results$loc_st_rmse_cv[kk] <- i_k_pred$validation$cv_results$st_rmse_cv[best_row] } else { @@ -1201,14 +1214,14 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, selected_pls <- method$pls_c } } - + if (is_nnv_val) { if (!is.null(group)) { out_group <- which(i_k_group == i_k_group[[ith_observation$local_index_nearest]]) } else { out_group <- ith_observation$local_index_nearest } - + nearest_pred <- fit_and_predict( x = i_k_xr[-out_group, ], y = i_k_yr[-out_group, , drop = FALSE], @@ -1223,7 +1236,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, pls_max_iter = 1, pls_tol = 1e-6 )$prediction - + ith_pred_results$y_nearest_pred[kk] <- nearest_pred / kth_weights[1] } } else { @@ -1233,12 +1246,12 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, ith_pred_results$k_diss[kk] <- ith_k_diss } } - + if (control$progress) { if (kk == nrow(ith_pred_results) & i != n_iter) { - cat(to_erase, "\r") + cat("\r", to_erase, "\r") } - + if (i == n_iter) { cat("\n") } @@ -1250,34 +1263,30 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, additional_results = additional_results ) } - - + iteration_order <- sapply(pred_obs, - FUN = function(x) x$results$o_index[1] - ) - - - results_table <- do.call("rbind", lapply(iteration_order, - FUN = function(x, ii) x[[ii]]$results, - x = pred_obs - )) - - + FUN = function(x) x$results$o_index[1]) + + pred_obs <- pred_obs[order(iteration_order, decreasing = FALSE)] + + results_table <- do.call("rbind", lapply(pred_obs, + FUN = function(x) x$results)) + if (".local" %in% names(input_dots) & diss_method %in% ortho_diss_methods) { diss_xr_xu <- do.call( "cbind", lapply(iteration_order, - FUN = function(x, m, ii) { - idc <- x[[ii]]$additional_results$ith_neig_indices - d <- x[[ii]]$additional_results$ith_neigh_diss - m[idc] <- d - m - }, - x = pred_obs, - m = matrix(NA, nrow(Xr), 1) + FUN = function(x, m, ii) { + idc <- x[[ii]]$additional_results$ith_neig_indices + d <- x[[ii]]$additional_results$ith_neigh_diss + m[idc] <- d + m + }, + x = pred_obs, + m = matrix(NA, nrow(Xr), 1) ) ) - class(diss_xr_xu) <- c("localortho_diss", "matrix") + class(diss_xr_xu) <- c("local_ortho_diss", "matrix") dimnames(diss_xr_xu) <- list( paste0("Xr_", 1:nrow(diss_xr_xu)), paste0("Xu_", 1:ncol(diss_xr_xu)) @@ -1285,17 +1294,17 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, neighborhoods$neighbors <- do.call( "cbind", lapply(iteration_order, - FUN = function(x, m, ii) { - idc <- x[[ii]]$additional_results$ith_neig_indices - m[1:length(idc)] <- idc - m - }, - x = pred_obs, - m = matrix(NA, max(results_table$k), 1) + FUN = function(x, m, ii) { + idc <- x[[ii]]$additional_results$ith_neig_indices + m[1:length(idc)] <- idc + m + }, + x = pred_obs, + m = matrix(NA, max(results_table$k), 1) ) ) } - + out <- c( if (is.null(Yu)) { "yu_obs" @@ -1317,21 +1326,21 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, }, "rep" ) - + results_table[, (out) := NULL] if (!is.null(k_diss)) { param <- "k_diss" results_table <- lapply(get(param), - FUN = function(x, sel, i) x[x[[sel]] == i, ], - x = results_table, - sel = param + FUN = function(x, sel, i) x[x[[sel]] == i, ], + x = results_table, + sel = param ) names(results_table) <- paste0("k_diss_", k_diss) p_bounded <- sapply(results_table, - FUN = function(x, k_range) { - sum(x$k_original <= k_range[1] | x$k_original >= k_range[2]) - }, - k_range = k_range + FUN = function(x, k_range) { + sum(x$k_original <= k_range[1] | x$k_original >= k_range[2]) + }, + k_range = k_range ) col_ks <- data.table( k_diss = k_diss, @@ -1340,14 +1349,14 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } else { param <- "k" results_table <- lapply(get(param), - FUN = function(x, sel, i) x[x[[sel]] == i, ], - x = results_table, - sel = param + FUN = function(x, sel, i) x[x[[sel]] == i, ], + x = results_table, + sel = param ) names(results_table) <- paste0("k_", k) col_ks <- data.table(k = k) } - + if (validation_type %in% c("NNv", "both")) { nn_stats <- function(x) { nn_rmse <- (mean((x$y_nearest - x$y_nearest_pred)^2))^0.5 @@ -1355,17 +1364,17 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, nn_rsq <- (cor(x$y_nearest, x$y_nearest_pred))^2 c(nn_rmse = nn_rmse, nn_st_rmse = nn_st_rmse, nn_rsq = nn_rsq) } - + loc_nn_res <- do.call("rbind", lapply(results_table, FUN = nn_stats)) loc_nn_res <- cbind(col_ks, - rmse = loc_nn_res[, "nn_rmse"], - st_rmse = loc_nn_res[, "nn_st_rmse"], - r2 = loc_nn_res[, "nn_rsq"] + rmse = loc_nn_res[, "nn_rmse"], + st_rmse = loc_nn_res[, "nn_st_rmse"], + r2 = loc_nn_res[, "nn_rsq"] ) } else { loc_nn_res <- NULL } - + if (validation_type %in% c("local_cv", "both")) { mean_loc_res <- function(x) { mean_loc_rmse <- mean(x$loc_rmse_cv) @@ -1374,37 +1383,37 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } loc_res <- do.call("rbind", lapply(results_table, mean_loc_res)) loc_res <- cbind(col_ks, - rmse = loc_res[, "loc_rmse"], - st_rmse = loc_res[, "loc_st_rmse"] + rmse = loc_res[, "loc_rmse"], + st_rmse = loc_res[, "loc_st_rmse"] ) } else { loc_res <- NULL } - + if (!is.null(Yu)) { for (i in 1:length(results_table)) { results_table[[i]]$yu_obs <- Yu } yu_stats <- function(x) { - yu_rmse <- (mean((x$yu_obs - x$pred)^2))^0.5 - yu_st_rmse <- yu_rmse / diff(range(x$yu_obs)) - yu_rsq <- (cor(x$yu_obs, x$pred))^2 + yu_rmse <- mean((x$yu_obs - x$pred)^2, na.rm = TRUE)^0.5 + yu_st_rmse <- yu_rmse / diff(range(x$yu_obs, na.rm = TRUE)) + yu_rsq <- cor(x$yu_obs, x$pred, use = "complete.obs")^2 c(yu_rmse = yu_rmse, yu_st_rmse = yu_st_rmse, yu_rsq = yu_rsq) } pred_res <- do.call("rbind", lapply(results_table, yu_stats)) pred_res <- cbind(col_ks, - rmse = pred_res[, "yu_rmse"], - st_rmse = pred_res[, "yu_st_rmse"], - r2 = pred_res[, "yu_rsq"] + rmse = pred_res[, "yu_rmse"], + st_rmse = pred_res[, "yu_st_rmse"], + r2 = pred_res[, "yu_rsq"] ) } else { pred_res <- NULL } - - if ("localortho_diss" %in% class(diss_xr_xu)) { + + if ("local_ortho_diss" %in% class(diss_xr_xu)) { diss_method <- paste0(diss_method, " (locally computed)") } - + if (control$return_dissimilarity) { diss_list <- list( diss_method = diss_method, @@ -1416,7 +1425,7 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, } else { diss_list <- NULL } - + colnames(neighborhoods$neighbors) <- paste0("Xu_", 1:nrow(Xu)) rownames(neighborhoods$neighbors) <- paste0("k_", 1:nrow(neighborhoods$neighbors)) results_list <- list( @@ -1437,9 +1446,9 @@ mbl <- function(Xr, Yr, Xu, Yu = NULL, results = results_table, documentation = documentation ) - + attr(results_list, "call") <- f_call class(results_list) <- c("mbl", "list") - + results_list } diff --git a/R/mbl_control.R b/R/mbl_control.R index fda5e9c..968d3a6 100644 --- a/R/mbl_control.R +++ b/R/mbl_control.R @@ -25,14 +25,14 @@ #' below). #' @param tune_locally a logical. It only applies when #' \code{validation_type = "local_cv"} and "pls" or "wapls" fitting algorithms are -#' used. If \code{TRUE}, the the parameters of the local pls-based models +#' used. If \code{TRUE}, then the parameters of the local pls-based models #' (i.e. pls factors for the "pls" method and minimum and maximum pls factors -#' for the "wapls" method). Default is #' \code{TRUE}. +#' for the "wapls" method) are optimized. Default is \code{TRUE}. #' @param number an integer indicating the number of sampling iterations at #' each local segment when \code{"local_cv"} is selected in the #' \code{validation_type} argument. Default is 10. -#' @param p a numeric value indicating the percentage of observations to be retained -#' at each sampling iteration at each local segment when \code{"local_cv"} +#' @param p a numeric value indicating the percentage of calibration observations +#' to be retained at each sampling iteration at each local segment when \code{"local_cv"} #' is selected in the \code{validation_type} argument. Default is 0.75 (i.e. 75 "\%"). #' @param range_prediction_limits a logical. It indicates whether the prediction #' limits at each local regression are determined by the range of the response @@ -57,24 +57,28 @@ #' the group of neighbors of each observation to be predicted, the nearest observation #' (i.e. the most similar observation) is excluded and then a local model is fitted #' using the remaining neighbors. This model is then used to predict the value -#' of the target response variable of the nearest observation. These predicted +#' of the response variable of the nearest observation. These predicted #' values are finally cross validated with the actual values (See Ramirez-Lopez #' et al. (2013a) for additional details). This method is faster than #' \code{"local_cv"}.} #' \item{Local leave-group-out cross-validation (\code{"local_cv"}):}{ The #' group of neighbors of each observation to be predicted is partitioned into #' different equal size subsets. Each partition is selected based on a -#' stratified random sampling which takes into account the values of the -#' response variable of the corresponding set of neighbors. The selected -#' local subset is used as local validation subset and the remaining observations -#' are used for fitting a model. This model is used to predict the target -#' response variable values of the local validation subset and the local root -#' mean square error is computed. This process is repeated \mjeqn{m}{m} times and -#' the final local error is computed as the average of the local root mean -#' square error of all the \mjeqn{m}{m} iterations. In the \code{mbl} function -#' \mjeqn{m}{m} is controlled by the \code{number} argument and the size of the -#' subsets is controlled by the \code{p} argument which indicates the -#' percentage of observations to be selected from the subset of nearest neighbours. +#' stratified random sampling that uses the the distribution of +#' the response variable in the corresponding set of neighbors. When +#' \code{p} \mjeqn{>=}{\geqslant} 0.5 (i.e. the number of calibration +#' observations to retain is larger than 50% of the total samples in the neighborhood), +#' the sampling is conducted for selecting the validation samples, and when +#' \code{p} < 0.5 the sampling is conducted for selecting the calibration +#' samples (samples used for model fitting). The model fitted with the selected +#' calibration samples is used to predict the response values of the local +#' validation samples and the local root mean square error is computed. +#' This process is repeated \mjeqn{m}{m} times and the final local +#' error is computed as the average of the local root mean square errors +#' obtained for all the \mjeqn{m}{m} iterations. In the \code{mbl_control} function +#' \mjeqn{m}{m} is controlled by the \code{number} argument and the size of the +#' subsets is controlled by the \code{p} argument which indicates the +#' percentage of observations to be selected from the subset of nearest neighbors. #' The global error of the predictions is computed as the average of the local #' root mean square errors.} #' \item{No validation (\code{"none"}):}{ No validation is carried out. @@ -83,7 +87,7 @@ #' validation(s) will be carried out.} #' } #' @return a \code{list} mirroring the specified parameters -#' @author Leonardo Ramirez-Lopez and Antoine Stevens +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens #' @references #' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Stevens, A., Dematte, J.A.M., #' Scholten, T. 2013a. The spectrum-based learner: A new local approach for diff --git a/R/ortho_diss.R b/R/ortho_diss.R index 0e4b182..1cadb47 100644 --- a/R/ortho_diss.R +++ b/R/ortho_diss.R @@ -13,7 +13,7 @@ #' @usage #' ortho_diss(Xr, Xu = NULL, #' Yr = NULL, -#' pc_selection = list(method = "cumvar", value = 0.99), +#' pc_selection = list(method = "var", value = 0.01), #' diss_method = "pca", #' .local = FALSE, #' pre_k, @@ -61,14 +61,14 @@ #' \item{\code{"cumvar"}:}{ selection of the principal components based #' on a given cumulative amount of explained variance. In this case, #' \code{value} must be a value (larger than 0 and below or equal to 1) -#' indicating the maximum amount of cumulative variance that the -#' retained components should explain.} +#' indicating the minimum amount of cumulative variance that the +#' combination of retained components should explain.} #' #' \item{\code{"var"}:}{ selection of the principal components based #' on a given amount of explained variance. In this case, #' \code{value} must be a value (larger than 0 and below or equal to 1) -#' indicating the minimum amount of variance that a component should -#' explain in order to be retained.} +#' indicating the minimum amount of variance that a single component +#' should explain in order to be retained.} #' #' \item{\code{"manual"}:}{ for manually specifying a fix number of #' principal components. In this case, \code{value} must be a value @@ -76,12 +76,12 @@ #' indicating the minimum amount of variance that a component should #' explain in order to be retained.} #' } -#' The default list passed is \code{list(method = "cumvar", value = 0.99)}. +#' The default list passed is \code{list(method = "var", value = 0.01)}. #' Optionally, the \code{pc_selection} argument admits \code{"opc"} or #' \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character #' string. In such case, the default \code{"value"} when either \code{"opc"} or #' \code{"manual"} are used is 40. When \code{"cumvar"} is used the default -#' \code{"value"} is set to 0.99 and when \code{"var"} is used the default +#' \code{"value"} is set to 0.99 and when \code{"var"} is used, the default #' \code{"value"} is set to 0.01. #' @param diss_method a character value indicating the type of projection on which #' the dissimilarities must be computed. This argument is equivalent to @@ -131,7 +131,7 @@ #' These neighbors (together with the target observation) are projected #' (from the original data space) onto a (local) orthogonal space (using the #' same parameters specified in the function). In this projected space the -#' Mahalanobis distance between the target observation and the neighbors is +#' Mahalanobis distance between the target observation and its neighbors is #' recomputed. A missing value is assigned to the observations that do not belong to #' this set of neighbors (non-neighbor observations). #' In this case the dissimilarity matrix cannot be considered as a distance @@ -168,12 +168,12 @@ #' between each target observation and its neighbor observations.} #' \item{\code{dissimilarity}}{ the computed dissimilarity matrix. If #' \code{.local = FALSE} a distance matrix. If \code{.local = TRUE} a matrix of -#' class \code{localortho_diss}. In this case, each column represent the dissimilarity +#' class \code{local_ortho_diss}. In this case, each column represent the dissimilarity #' between a target observation and its neighbor observations.} #' \item{\code{projection}}{if \code{return_projection = TRUE}, #' an \code{ortho_projection} object.} #' } -#' @author Leonardo Ramirez-Lopez +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} #' @references #' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Stevens, A., Dematte, J.A.M., #' Scholten, T. 2013a. The spectrum-based learner: A new local approach for @@ -263,7 +263,7 @@ ortho_diss <- function(Xr, Xu = NULL, Yr = NULL, - pc_selection = list(method = "cumvar", value = 0.99), + pc_selection = list(method = "var", value = 0.01), diss_method = "pca", .local = FALSE, pre_k, @@ -492,7 +492,7 @@ ortho_diss <- function(Xr, Xu = NULL, } class(resultsList) <- c("ortho_diss", "list") - class(resultsList$dissimilarity) <- c("localortho_diss", "matrix") + class(resultsList$dissimilarity) <- c("local_ortho_diss", "matrix") return(resultsList) } else { resultsList <- list( diff --git a/R/plot.ortho_projection.R b/R/plot.ortho_projection.R index e1ba89d..b080729 100644 --- a/R/plot.ortho_projection.R +++ b/R/plot.ortho_projection.R @@ -5,29 +5,25 @@ #' #' Plots objects of class \code{ortho_projection} #' @aliases plot.ortho_projection -#' @usage \method{plot}{ortho_projection}(x, ...) +#' @usage \method{plot}{ortho_projection}(x, col = "dodgerblue", ...) #' @param x an object of class \code{ortho_projection} (as returned by \code{ortho_projection}). +#' @param col the color of the plots (default is "dodgerblue") #' @param ... arguments to be passed to methods. #' @author Leonardo Ramirez-Lopez and Antoine Stevens #' @seealso \code{\link{ortho_projection}} #' @importFrom graphics barplot #' @export -plot.ortho_projection <- function(x, ...) { - in.call <- match.call()$col - if (is.null(in.call$col)) { - col <- "dodgerblue" - } - +plot.ortho_projection <- function(x, col = "dodgerblue", ...) { if (x$method == "pls") { x_variance <- x$variance$x_var } else { x_variance <- x$variance } - + if (x$pc_selection$method == "opc") { tpl <- x$opc_evaluation[, c(1, ncol(x$opc_evaluation))] - if ("mean_standardized_rmsd" %in% colnames(tpl)) { + if ("mean_standardized_rmsd_Yr" %in% colnames(tpl)) { ylab <- "mean of the standardized RMSD of all Y variables" } if (colnames(tpl)[2] %in% c("rmsd_Yr", "rmsd")) { @@ -37,23 +33,24 @@ plot.ortho_projection <- function(x, ...) { ylab <- "kappa index" } plot(tpl, - type = "b", - ylab = ylab, pch = 1, col = col, ... + type = "p", + ylab = ylab, pch = 1, col = col, ... ) - } - if (x$pc_selection$method == "cumvar") { - barplot(x_variance[grep("cumulative", rownames(x_variance)), ], - horiz = F, - names.arg = colnames(x_variance), ylim = c(0, 1), - ylab = "Explained variance (cummulative)", col = col, ... - ) - } - if (x$pc_selection$method %in% c("cumvar", "manual")) { - x$variance + grid() + segments(tpl[,1], 0, tpl[,1], tpl[,2], col = col) + } else{ + o_mfrow <- par()$mfrow + par(mfrow = c(1, 2)) barplot(x_variance[grep("^explained_var", rownames(x_variance)), ], - horiz = F, - names.arg = colnames(x_variance), ylim = c(0, 1), - ylab = "Explained variance", col = col, ... + horiz = F, + names.arg = colnames(x_variance), ylim = c(0, 1), + ylab = "Explained variance", col = col, ... + ) + barplot(x_variance[grep("cumulative", rownames(x_variance)), ], + horiz = F, + names.arg = colnames(x_variance), ylim = c(0, 1), + ylab = "Explained variance (cumulative)", col = col, ... ) + par(mfrow = o_mfrow) } } diff --git a/R/print.local_ortho_diss.R b/R/print.local_ortho_diss.R index 0b94fb4..84a5b39 100644 --- a/R/print.local_ortho_diss.R +++ b/R/print.local_ortho_diss.R @@ -1,7 +1,7 @@ #' @title Print method for an object of class \code{ortho_diss} #' @description Prints the content of an object of class \code{ortho_diss} #' @usage \method{print}{local_ortho_diss}(x, ...) -#' @param x an object of class \code{localortho_diss} (returned by +#' @param x an object of class \code{local_ortho_diss} (returned by #' \code{ortho_diss} when it uses \code{.local = TRUE}). #' @param ... arguments to be passed to methods (not yet functional). #' @author Leonardo Ramirez-Lopez and Antoine Stevens @@ -54,7 +54,7 @@ print.local_ortho_diss <- function(x, ...) { #' @param columns the indices of the columns #' @param drop drop argument #' @param ... not used -#' @description prints the subsets of localortho_diss objects +#' @description prints the subsets of local_ortho_diss objects #' @keywords internal #' @export "[.local_ortho_diss" <- function(x, rows, columns, drop = FALSE, ...) { @@ -65,7 +65,7 @@ print.local_ortho_diss <- function(x, ...) { class(object) <- NULL obj <- object[rows, columns, drop = drop] if (!drop) { - class(obj) <- "localortho_diss" + class(obj) <- "local_ortho_diss" } return(obj) } diff --git a/R/resemble.R b/R/resemble.R index 7aa1f6b..5ecdd74 100644 --- a/R/resemble.R +++ b/R/resemble.R @@ -10,7 +10,7 @@ #' @importFrom lifecycle deprecate_soft ## usethis namespace: end #' @importFrom magrittr %>% -#' @importFrom dplyr mutate rename if_else select +#' @importFrom dplyr if_else select #' @importFrom utils setTxtProgressBar txtProgressBar #' @importFrom stats model.frame model.matrix model.extract na.fail sd reshape #' @description @@ -27,7 +27,10 @@ #' modeling complex spectral spectra (e.g. NIR, IR). #' The package includes functions for dimensionality reduction, #' computing spectral dissimilarity matrices, nearest neighbor search, -#' and modeling spectral data using memory-based learning. +#' and modeling spectral data using memory-based learning. +#' +#' Development versions can be found in the github repository of the package +#' at \href{https://github.com/l-ramirez-lopez/resemble}{https://github.com/l-ramirez-lopez/resemble}. #' #' The functions available for dimensionality reduction are: #' \itemize{ @@ -66,18 +69,19 @@ #' @name resemble-package #' @aliases resemble-package resemble #' @title Overview of the functions in the resemble package -#' @author -#' Leonardo Ramirez-Lopez +#' @author +#' +#' \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} [aut, cre] #' -#' Antoine Stevens +#' \href{https://orcid.org/0000-0002-1588-7519}{Antoine Stevens} [ctb] #' -#' Raphael Viscarra Rossel +#' \href{https://orcid.org/0000-0003-1540-4748}{Raphael Viscarra Rossel} [ctb] #' -#' Craig Lobsey +#' \href{https://orcid.org/0000-0001-5416-8640}{Craig Lobsey} [ctb] #' -#' Alex Wadoux +#' \href{https://orcid.org/0000-0001-7325-9716}{Alex Wadoux} [ctb] #' -#' Timo Breure +#' \href{https://orcid.org/0000-0001-5695-8064}{Timo Breure} [ctb] ###################################################################### # resemble # Copyright (C) 2020 Leonardo Ramirez-Lopez @@ -92,6 +96,4 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. ###################################################################### - - NULL diff --git a/R/rslocal_helpers.R b/R/rslocal_helpers.R deleted file mode 100644 index 1f872a7..0000000 --- a/R/rslocal_helpers.R +++ /dev/null @@ -1,192 +0,0 @@ -#' @title iterates over elements to print -#' @description internal. used for printing -#' @keywords internal -cat_iter <- function(x) { - iter_x <- iter(x, by = "cell", recycle = TRUE) - - next_e <- function() { - nextElem(iter_x) - } - obj <- next_e - class(obj) <- c("isubset", "abstractiter", "iter") - obj -} - -#' @title fits pls models at each r-local iteration -#' @description internal -#' @keywords internal -biter <- function(itersubs, - Xu, - Yu, - iter_sequence, - optimization, - ncomp, - tune, - p, - number, - scale, - max_iter, - tol, - allow_parallel) { - "%mydo%" <- get("%do%") - if (allow_parallel & getDoParRegistered()) { - "%mydo%" <- get("%dopar%") - } - - its <- NULL - rdf <- foreach( - idx = iter_sequence, - its = itersubs - ) %mydo% { - # # - # # Step 2 - sample a training data set of size k from K without replacement - # # - # selected.idx <- sample(k_idx, - # size = k, - # replace = FALSE) - # - # # retrieve the spectra and dependent variable from the SL for the training - # set selection - # X <- Xr[selected.idx,] - # Y <- Yr[selected.idx] - - # dataset <- cbind(X, Y) - - # - # Step 3 calibrate a PLS model using the selected SL samples - # - - if (tune) { - # perform a leave-group-out cross.validation of the selected SL (k) - # observations to determine a suitable number of pls factors - plsv <- pls_cv( - x = its$x, - y = its$y, - ncomp = ncomp, - method = "pls", - center = TRUE, - scale = scale, - min_component = 1, - p = p, - number = number, - group = its$group, - retrieve = FALSE, - tune = TRUE, - max_iter = max_iter, - tol = tol - ) - - selected_pls_c <- which.min(plsv$cv_results$rmse_cv)[1] - } else { - selected_pls_c <- ncomp - } - - # build the model - mod <- opls_get_basics( - X = its$x, - Y = its$y, - ncomp = selected_pls_c, - scale = scale, - maxiter = max_iter, - tol = tol - ) - - if (optimization == "response") { - # rs <- Xu - (Xu %*% mod$projection_mat %*% mod$X_loadings) - - y_pred <- predict_opls( - bo = mod$bo, - b = mod$coefficients, - ncomp = selected_pls_c, - newdata = Xu, - scale = scale, - Xscale = mod$transf$Xscale - ) - y_pred <- y_pred[, selected_pls_c] - - # - # Step 4 - validate on the 'm' site specific observations - # - # y_pred <- as.numeric(predict(mod, Xu, ncomp=selected_pls_c)) - - # It is possible we get NA predictions, catch this case and ignore this - # observation test - if (any(is.na(y_pred))) { - - } else { - # rmse <- sqrt(sum((y_pred - Yu)^2)/(length(Yu) - 1)) - # rmse <- sqrt(sum((y_pred - Yu)^2)/(length(Yu))) - rmse <- sqrt(mean((y_pred - Yu)^2)) - } - } - - if (optimization == "reconstruction") { - ## this must be implemented in Rcpp - # xpred <- (Xu %*% mod$projection_mat) %*% mod$X_loadings - # rmse <- sqrt(mean((Xu - xpred)^2)) - rmse <- reconstruction_error(Xu, mod$projection_mat, mod$X_loadings)[[1]] - } - - # combine the validation results and the observations that were selected for this - # model - return(c( - rmse_subset = rmse, - sample = its$idx - )) - } - - # compile a data frame of all results of the sampling iterations - rdf <- do.call("rbind", rdf) - rmsesubset <- rdf[, "rmse_subset"] - rdf <- rdf[, !colnames(rdf) %in% "rmse_subset"] - return(list( - rmsesubset = rmsesubset, - sampleidx = rdf - )) -} - -#' @title iterator that subsets a matrix based on input indices for pls modeling -#' at each rs-local iteration -#' @description internal -#' @param x an input matrix of predictors -#' @param y a response variable -#' @param group a variable giving the groups/calsses to which the observations -#' belong to (used for avoiding pseudo-replication during validation) -#' @param indx the indices to retrieve -#' @return an object of \code{class} iterator giving a \code{list} with (a) a -#' subset of the input matrix with rows re-arranged according to `indx` -#' @details this is designed to be used within (parallel) loops to avoid sending -#' entire matrices to each core. It just sends the subset of that is required. -#' @author Leonardo Ramirez-Lopez -#' @keywords internal -ithrssubsets <- function(x, - y, - group = NULL, - indx) { - it_indx <- iter(indx, by = "column") - - if (is.null(group)) { - nextEl <- function() { - ss <- nextElem(it_indx) - list( - x = x[ss, , drop = FALSE], - y = y[ss, , drop = FALSE], - group = NULL, - idx = ss - ) - } - } else { - nextEl <- function() { - ss <- nextElem(it_indx) - list( - x = x[ss, , drop = FALSE], - y = y[ss, , drop = FALSE], - group = factor(group[ss]), - idx = ss - ) - } - } - obj <- list(nextElem = nextEl) - class(obj) <- c("isubset", "abstractiter", "iter") - obj -} diff --git a/R/sample_stratified.R b/R/sample_stratified.R new file mode 100644 index 0000000..51327ba --- /dev/null +++ b/R/sample_stratified.R @@ -0,0 +1,404 @@ +#' @title A function to create calibration and validation sample sets for +#' leave-group-out and bootstraping cross-validation +#' @description for internal use only! This is stratified sampling based on the +#' values of a continuous response variable (y). If group is provided, the +#' sampling is done based on the groups and the average of y per group. This +#' function is used to create calibration and validation groups for +#' leave-group-out cross-validations (or +#' leave-group-of-groups-out cross-validation if group argument is provided) +#' and/or bootstrapping. +#' @param y a matrix of one column with the response variable. +#' @param p the percentage of samples (or groups if group argument is used) to +#' retain in the validation_indices set +#' @param number the number of sample groups to be crated +#' @param group the labels for each sample in \code{y} indicating the group each +#' observation belongs to. +#' @param replacement A logical indicating sample replacements for the +#' calibration set are required. +#' @return a list with two matrices (\code{hold_in} and +#' \code{hold_out}) giving the indices of the observations in each +#' column. The number of columns represents the number of sampling repetitions. +#' @keywords internal + +sample_stratified <- function(y, p, number, group = NULL, replacement = FALSE) { + + ## If the percentage of samples to build the hold_in subset is below 50% of + ## the total number of samples, the selection is based on the number of samples + ## to retain. + ## On the other hand, if the percentage of samples to build the hold_in subset + ## is equal or above 50% of the total number of samples, the selection is based + ## on the number of samples to exclude. + if (p < 0.5) { + p_to_sample <- p + do_sampling_for <- "calibration" + } else { + p_to_sample <- 1 - p + do_sampling_for <- "validation" + } + + if (is.null(group)) { + nv <- floor(p_to_sample * nrow(y)) + nv <- ifelse(nv < 1, 1, nv) + + if (p >= 0.5) { + n_val <- nv + if (replacement) { + n_cal <- nrow(y) + } else { + n_cal <- nrow(y) - nv + } + } else { + n_val <- nrow(y) - nv + if (replacement) { + n_cal <- nrow(y) + } else { + n_cal <- nrow(y) - n_val + } + } + + strata_category <- optim_sample_strata( + y = y, + n = nv + ) + + calibration_indices <- matrix(NA, n_cal, number) + validation_indices <- matrix(NA, n_val, number) + + colnames(calibration_indices) <- colnames(validation_indices) <- paste0( + "Resample_", + seq(1:number) + ) + rownames(calibration_indices) <- paste0("index_", seq(1:n_cal)) + rownames(validation_indices) <- paste0("index_", seq(1:nrow(validation_indices))) + + indcs <- 1:nrow(y) + for (jj in 1:number) { + strata_samples <- get_samples_from_strata( + original_order = strata_category$sample_strata$original_order, + strata = strata_category$sample_strata$strata, + samples_per_strata = strata_category$samples_to_get, + replacement = replacement, + sampling_for = do_sampling_for + ) + + calibration_indices[, jj] <- strata_samples$calibration + validation_indices[, jj] <- strata_samples$validation + } + } else { + y_groups <- data.table( + y = y, + group = factor(group), + original_order = 1:length(y) + ) + gr_levels <- levels(y_groups$group) + n_levels <- nlevels(y_groups$group) + # Compute means per group and make the stratified sampling based on that + # vector of means + aggregated_y <- tapply( + X = y_groups$y, + INDEX = y_groups$group, + FUN = mean + ) + + nv <- floor(p_to_sample * n_levels) + nv <- ifelse(nv < 1, 1, nv) + + if (p >= 0.5) { + n_val <- nv + if (replacement) { + n_cal <- nrow(y) + } else { + n_cal <- nrow(y) - nv + } + } else { + n_val <- nrow(y) - nv + if (replacement) { + n_cal <- nrow(y) + } else { + n_cal <- nrow(y) - n_val + } + } + + strata_category <- optim_sample_strata( + y = aggregated_y, + n = nv + ) + strata_category$strata <- paste0("l_", strata_category$strata) + + calibration_indices <- matrix(0, nv, number) + colnames(calibration_indices) <- paste0("Resample_", seq(1:number)) + rownames(calibration_indices) <- paste0("index_", seq(1:nv)) + + calibration_groups_indices <- validation_groups_indices <- NULL + + for (jj in 1:number) { + + strata_samples <- get_samples_from_strata( + original_order = strata_category$sample_strata$original_order, + strata = strata_category$sample_strata$strata, + samples_per_strata = strata_category$samples_to_get, + sampling_for = do_sampling_for, + replacement = replacement + ) + + if (replacement) { + sel_sample_indices <- lapply(1:length(strata_samples$calibration), + FUN = function(ith, full_groups, cal_groups) { + ## this equivalent to extract from + ## y_groups$original_order + which(full_groups %in% cal_groups[ith]) + }, + full_groups = y_groups$group, + cal_groups = gr_levels[strata_samples$calibration] + ) + sel_sample_indices <- do.call("c", sel_sample_indices) + } else { + sel_sample_indices <- y_groups$original_order[y_groups$group %in% gr_levels[strata_samples$calibration]] + } + + calibration_groups_indices[[jj]] <- sel_sample_indices + validation_groups_indices[[jj]] <- y_groups$original_order[y_groups$group %in% gr_levels[strata_samples$validation]] + } + lengths_list_calibration_indices <- lengths(calibration_groups_indices) + lengths_list_validation_indices <- lengths(validation_groups_indices) + calibration_indices <- matrix(NA, max(lengths_list_calibration_indices), number) + validation_indices <- matrix(NA, max(lengths_list_validation_indices), number) + colnames(calibration_indices) <- colnames(validation_indices) <- paste0( + "Resample_", + seq(1:number) + ) + rownames(calibration_indices) <- paste0("index_", 1:nrow(calibration_indices)) + rownames(validation_indices) <- paste0("index_", 1:nrow(validation_indices)) + indcs <- 1:nrow(y) + for (jj in 1:number) { + ## ramdomly select the replacements for the missing indexes + ## replacements are necessary! + n_cal_missing <- max(lengths_list_calibration_indices) - lengths_list_calibration_indices[jj] + jj_add_list_calibration_indices <- sample(calibration_groups_indices[[jj]], + size = n_cal_missing, + replace = TRUE + ) + + n_val_missing <- max(lengths_list_validation_indices) - lengths_list_validation_indices[jj] + jj_add_list_validation_indices <- sample(validation_groups_indices[[jj]], + size = n_val_missing, + replace = TRUE + ) + + calibration_indices[1:lengths_list_calibration_indices[jj], jj] <- calibration_groups_indices[[jj]] + validation_indices[1:lengths_list_validation_indices[jj], jj] <- validation_groups_indices[[jj]] + calibration_indices[-(1:lengths_list_calibration_indices[jj]), jj] <- jj_add_list_calibration_indices + validation_indices[-(1:lengths_list_validation_indices[jj]), jj] <- jj_add_list_validation_indices + } + } + list( + hold_in = calibration_indices, + hold_out = validation_indices + ) +} + +#' @title A function to assign values to sample distribution strata +#' @description for internal use only! This function takes a continuous variable, +#' creates n strata based on its distribution and assigns the corresponding starta +#' to every value. +#' @param y a matrix of one column with the response variable. +#' @param n the number of strata. +#' @return a data table with the input \code{y} and the corresponding strata to +#' every value. +#' @keywords internal +get_sample_strata <- function(y, n) { + y_strata <- unique(quantile(y, + probs = seq(0, 1, length = (n + 1)), + names = FALSE + )) + + + strata_labels <- 1:(length(y_strata) - 1) + y_cuts <- cut(y, + breaks = y_strata, + labels = strata_labels, + include.lowest = TRUE + ) + + strata_category <- data.table( + original_order = 1:length(y), + strata = y_cuts + ) +} + + +optim_sample_strata <- function(y, n) { + sample_strata <- get_sample_strata(y, n) + n_strata <- length(unique(sample_strata$strata)) + iter <- 1 + table_strata <- table(sample_strata$strata) + if (n_strata < n) { + keep_going <- TRUE + new_n <- n + new_min_samples_per_strata <- 2 ## at least 2 samples per strata + while (keep_going) { + n_vec <- 1:new_n + new_min_samples_per_strata <- new_min_samples_per_strata + 2 + s_missing <- n_vec[-as.numeric(names(table_strata))] + v_missing <- rep(0, length(s_missing)) + names(v_missing) <- s_missing + table_strata <- c(table_strata, v_missing) + + new_n <- ceiling(n / (iter + 1)) # ???????????????? + + sample_strata <- get_sample_strata(y, new_n) + table_strata <- table(sample_strata$strata) + + condition_1 <- new_min_samples_per_strata < min(table_strata) + condition_2 <- length(table(sample_strata$strata)) == new_n + condition_3 <- new_min_samples_per_strata >= n + + + if ((condition_1 & condition_2) | condition_3) { + break + } + iter <- iter + 1 + } + + samples_to_get_no_replacement <- rep( + new_min_samples_per_strata / 2, + nlevels(sample_strata$strata) + ) + + + samples_to_get <- data.table( + strata = levels(sample_strata$strata), + samples_to_get = samples_to_get_no_replacement + ) + total_samples <- new_n * (iter + 1) + to_remove <- total_samples - n + if (to_remove > 0) { + highest_freqs <- names(sort(table_strata, decreasing = TRUE)[1:to_remove]) + samples_to_get$samples_to_get[samples_to_get$strata %in% highest_freqs] <- + samples_to_get$samples_to_get[samples_to_get$strata %in% highest_freqs] - 1 + } + } else { + samples_to_get <- data.table( + strata = levels(sample_strata$strata), + samples_to_get = 1 + ) + } + + list( + sample_strata = sample_strata, + samples_to_get = samples_to_get + ) +} + +#' @title A function for stratified calibration/validation sampling +#' @description for internal use only! This function selects samples +#' based on provided strata. +#' @param original_order a matrix of one column with the response variable. +#' @param starta the number of strata. +#' @param sampling_for sampling to select the calibration samples ("calibration") +#' or sampling to select the validation samples ("validation"). +#' @param replacement logical indicating if sampling with replacement must be +#' done. +#' @return a list with the indices of the calibration and validation samples. +#' @keywords internal +get_samples_from_strata <- function(original_order, + strata, + samples_per_strata, + sampling_for = c("calibration", "validation"), + replacement = FALSE) { + n_samples <- sum(samples_per_strata$samples_to_get) + with_replacement <- FALSE + if (replacement & sampling_for == "validation") { + samples_per_strata$samples_to_get <- 2 * samples_per_strata$samples_to_get + with_replacement <- TRUE + } + + get_random_sample <- function(x, ns) { + if (length(x) == 1) { + # this is required to keep the name of the + # strata, otherwise it fails + x <- c(x, x) + } + sample(x, size = ns) + } + + ## for selecting the replacement samples in cases where a strata has only one + ## sample, the replacement sample is randomly selected from the data + max_samples <- max(samples_per_strata$samples_to_get) + vec_samples <- rep(NA, max_samples) + + strata_samples <- lapply(levels(strata), + FUN = function(strata, + original_order, + samples_per_strata, + vec_samples, + replacement, + ii) { + ith_n <- samples_per_strata$samples_to_get[samples_per_strata$strata == ii] + ith_set <- original_order[which(strata == ii)] + ith_sel <- get_random_sample(ith_set, ith_n) + if (replacement) { + ln <- (length(ith_sel) / 2) + vec_samples[1:ln] <- ith_sel[1:ln] + vec_samples[(length(vec_samples) - ln + 1):length(vec_samples)] <- ith_sel[-(1:ln)] + } else { + vec_samples[1:length(ith_sel)] <- ith_sel + } + + vec_samples + }, + strata = strata, + original_order = original_order, + samples_per_strata = samples_per_strata, + vec_samples = vec_samples, + replacement = with_replacement + ) + + strata_samples <- do.call("rbind", strata_samples) + if (replacement & sampling_for == "validation") { + col_s <- 1:(ncol(strata_samples) / 2) + col_replacement <- -col_s + strata_samples <- cbind( + sort(strata_samples[, col_s]), + sort(strata_samples[, col_replacement]) + ) + } else { + strata_samples <- as.matrix(sort(strata_samples)) + } + + + if (sampling_for == "validation") { + if (replacement) { + unique_sample_strata <- levels(strata)[strata_samples[, 1] == strata_samples[, 2]] + if (length(unique_sample_strata > 0)) { + solve_replacement <- sample(original_order[-strata_samples[, 2]], length(unique_sample_strata)) + strata_samples[unique_sample_strata, 2] <- solve_replacement + } + + replacement_indices <- strata_samples[, 2] + } else { + replacement_indices <- NULL + } + + keep <- original_order[!original_order %in% strata_samples[, 1]] + exclude <- as.vector(sort(strata_samples[, 1])) + } + + if (sampling_for == "calibration") { + keep <- strata_samples[, 1] + exclude <- original_order[!original_order %in% keep] + if (replacement) { + replacement_indices <- sample(keep, length(original_order) - length(keep), replace = TRUE) + } else { + replacement_indices <- NULL + } + } + + keep <- sort(as.vector(c(keep, replacement_indices))) + + strata_samples <- list( + calibration = keep, + validation = exclude + ) + strata_samples +} diff --git a/R/search_neighbors.R b/R/search_neighbors.R index 9db1828..8bd603e 100644 --- a/R/search_neighbors.R +++ b/R/search_neighbors.R @@ -11,7 +11,7 @@ #' search_neighbors(Xr, Xu, diss_method = c("pca", "pca.nipals", "pls", "cor", #' "euclid", "cosine", "sid"), #' Yr = NULL, k, k_diss, k_range, spike = NULL, -#' pc_selection = list("cumvar", 0.99), +#' pc_selection = list("var", 0.01), #' return_projection = FALSE, return_dissimilarity = FALSE, #' ws = NULL, #' center = TRUE, scale = FALSE, @@ -84,7 +84,7 @@ #' components) and \code{value} (a numerical value that complements the selected #' method). The methods available are: #' \itemize{ -#' \item{\code{"opc"}:} {optimized principal component selection based on +#' \item{\code{"opc"}:} { optimized principal component selection based on #' Ramirez-Lopez et al. (2013a, 2013b). The optimal number of components #' (of set of observations) is the one for which its distance matrix #' minimizes the differences between the \code{Yr} value of each @@ -94,17 +94,17 @@ #' number of principal components to be tested. See the #' \code{\link{ortho_projection}} function for more details.} #' -#' \item{\code{"cumvar"}:}{selection of the principal components based +#' \item{\code{"cumvar"}:}{ selection of the principal components based #' on a given cumulative amount of explained variance. In this case, #' \code{value} must be a value (larger than 0 and below or equal to 1) -#' indicating the maximum amount of cumulative variance that the -#' retained components should explain.} +#' indicating the minimum amount of cumulative variance that the +#' combination of retained components should explain.} #' -#' \item{\code{"var"}:}{selection of the principal components based +#' \item{\code{"var"}:}{ selection of the principal components based #' on a given amount of explained variance. In this case, #' \code{value} must be a value (larger than 0 and below or equal to 1) -#' indicating the minimum amount of variance that a component should -#' explain in order to be retained.} +#' indicating the minimum amount of variance that a single component +#' should explain in order to be retained.} #' #' \item{\code{"manual"}:}{ for manually specifying a fix number of #' principal components. In this case, \code{value} must be a value @@ -112,12 +112,12 @@ #' indicating the minimum amount of variance that a component should #' explain in order to be retained.} #' } -#' The default list passed is \code{list(method = "cumvar", value = 0.99)}. +#' The default list passed is \code{list(method = "var", value = 0.01)}. #' Optionally, the \code{pc_selection} argument admits \code{"opc"} or #' \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character #' string. In such a case the default \code{"value"} when either \code{"opc"} or #' \code{"manual"} are used is 40. When \code{"cumvar"} is used the default -#' \code{"value"} is set to 0.99 and when \code{"var"} is used the default +#' \code{"value"} is set to 0.99 and when \code{"var"} is used, the default #' \code{"value"} is set to 0.01. #' @param return_projection a logical indicating if the projection(s) must be #' returned. Projections are used if the \code{\link{ortho_diss}} methods are @@ -155,11 +155,11 @@ #' \code{\link{mbl}} function. #' #' This function uses the \code{\link{dissimilarity}} fucntion to compute the -#' dissimilarities between code{Xr} and \code{Xu}. Arguments to +#' dissimilarities between \code{Xr} and \code{Xu}. Arguments to #' \code{\link{dissimilarity}} as well as further arguments to the functions #' used inside \code{\link{dissimilarity}} (i.e. \code{\link{ortho_diss}} #' \code{\link{cor_diss}} \code{\link{f_diss}} \code{\link{sid}}) can be passed to -#' those functions by using \code{...}. +#' those functions as additional arguments (i.e. \code{...}). #' @return a \code{list} containing the following elements: #' \itemize{ #' \item{\code{neighbors_diss}}{ a matrix of the \code{Xr} dissimilarity socres @@ -193,7 +193,7 @@ #' neighborhoods (see \code{\link{ortho_diss}} function for further #' details).} #' } -#' @author Leonardo Ramirez-Lopez +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} #' @references #' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Stevens, A., Dematte, J.A.M., #' Scholten, T. 2013a. The spectrum-based learner: A new local approach for @@ -293,26 +293,24 @@ ## - scaled renamed to scale ## - pcMethod and cores are deprecated -search_neighbors <- function(Xr, Xu, diss_method = c( - "pca", - "pca.nipals", - "pls", - "cor", - "euclid", - "cosine", - "sid" - ), +search_neighbors <- function(Xr, Xu, diss_method = c("pca", + "pca.nipals", + "pls", + "cor", + "euclid", + "cosine", + "sid"), Yr = NULL, k, k_diss, k_range, spike = NULL, - pc_selection = list("cumvar", 0.99), + pc_selection = list("var", 0.01), return_projection = FALSE, return_dissimilarity = FALSE, ws = NULL, center = TRUE, scale = FALSE, documentation = character(), ...) { - - + + # Sanity checks match.arg(diss_method, c( "pca", @@ -323,27 +321,27 @@ search_neighbors <- function(Xr, Xu, diss_method = c( "cosine", "sid" )) - + if (missing(k)) { k <- NULL } - + if (missing(k_diss)) { k_diss <- NULL } - + if (missing(k_range)) { k_range <- NULL } - + if (!is.logical(center)) { stop("'center' argument must be logical") } - + if (!is.logical(scale)) { stop("'scale' argument must be logical") } - + if (diss_method == "cor") { if (!is.null(ws)) { if (ws < 3 | ws > (ncol(Xr) - 1) | length(ws) != 1 | (ws %% 2) == 0) { @@ -354,18 +352,18 @@ search_neighbors <- function(Xr, Xu, diss_method = c( } } } - + if (!is.null(k) & !is.null(k_diss)) { # if k and k_diss are not called here, errors are thrown during checks k k_diss stop("Only one of k or k_diss can be specified") } - + if (is.null(k) & is.null(k_diss)) { stop("Either k or k_diss must be specified") } - + if (!is.null(k)) { k <- as.integer(k) if (k < 1) { @@ -379,7 +377,7 @@ search_neighbors <- function(Xr, Xu, diss_method = c( } kk <- k } - + if (!is.null(k_diss)) { # if k_diss is not called here, errors are thrown during checks k_diss @@ -425,7 +423,7 @@ search_neighbors <- function(Xr, Xu, diss_method = c( } } } - + if (!is.null(spike)) { if (!is.vector(spike)) { stop("spike must be a vector of integers") @@ -453,19 +451,19 @@ search_neighbors <- function(Xr, Xu, diss_method = c( scale = scale, ... ) - + results <- diss_to_neighbors(dsm$dissimilarity, - k = k, k_diss = k_diss, k_range = k_range, - spike = spike, - return_dissimilarity = return_dissimilarity + k = k, k_diss = k_diss, k_range = k_range, + spike = spike, + return_dissimilarity = return_dissimilarity ) - + if (return_projection & diss_method %in% c("pca", "pca.nipals", "pls")) { results$projection <- dsm$projection } if ("gh" %in% names(input_dots)) { results$gh <- dsm$gh } - + results } diff --git a/R/sid.R b/R/sid.R index ef05d15..ee6c154 100644 --- a/R/sid.R +++ b/R/sid.R @@ -5,7 +5,7 @@ #' \lifecycle{experimental} #' #' \loadmathjax -#' This function computes the spectral information divergence (distance) between +#' This function computes the spectral information divergence/dissimilarity between #' spectra based on the kullback-leibler divergence algorithm (see details). #' @usage #' sid(Xr, Xu = NULL, @@ -69,9 +69,9 @@ #' by each spectrum. The SID between two spectra (\mjeqn{x_{i}}{x_i} and #' \mjeqn{x_{j}}{x_j}) is computed as follows: #' -#' \mjdeqn{SID(x_{i},x_{j}) = KL(x_{i} \left |\right | x_{j}) + KL(x_{j} \left |\right | x_{i})}{SID(x_i,x_j) = KL(x_i |\ | x_j) + KL(x_j |\ | x_i)} +#' \mjdeqn{sid(x_{i},x_{j}) = KL(x_{i} \left |\right | x_{j}) + KL(x_{j} \left |\right | x_{i})}{sid(x_i,x_j) = KL(x_i |\ | x_j) + KL(x_j |\ | x_i)} #' -#' \mjdeqn{SID(x_{i},x_{j}) = \sum_{l=1}^{k} p_l \ log(\frac{p_l}{q_l}) + \sum_{l=1}^{k} q_l \ log(\frac{q_l}{p_l})}{SID(x_i,x_j) = \sum_{l=1}^{k} p_l \ log(p_l/q_l) + \sum_{l=1}^{k} q_l \ log(q_l/p_l)} +#' \mjdeqn{sid(x_{i},x_{j}) = \sum_{l=1}^{k} p_l \ log(\frac{p_l}{q_l}) + \sum_{l=1}^{k} q_l \ log(\frac{q_l}{p_l})}{sid(x_i,x_j) = \sum_{l=1}^{k} p_l \ log(p_l/q_l) + \sum_{l=1}^{k} q_l \ log(q_l/p_l)} #' #' where \mjeqn{k}{k} represents the number of variables or spectral features, #' \mjeqn{p}{p} and \mjeqn{q}{q} are the probability vectors of \mjeqn{x_{i}}{x_i} and diff --git a/R/sim_eval.R b/R/sim_eval.R index d3bcec3..a271042 100644 --- a/R/sim_eval.R +++ b/R/sim_eval.R @@ -34,13 +34,17 @@ #' is used for assessing the similarity between the observations and their #' corresponding most similar observations in terms of the side information #' provided. It is computed as follows: +#' +#' \mjdeqn{j(i) = NN(xr_i, Xr^{\{-i\}})}{j(i) = NN(xr_i, Xr^{\{-i\}})} +#' \mjdeqn{RMSD = \sqrt{\frac{1}{m} \sum_{i=1}^n {(y_i - y_{j(i)})^2}}}{RMSD = \sqrt{1/n sum_{i=1}^m (y_i - y_{j(i)})^2}} #' -#' \mjdeqn{RMSD = \sqrt{\frac{1}{n} \sum_{i=1}^n {(y_i - \ddot{y}_i)^2}}}{RMSD = \sqrt{1/n sum_{i=1}^n (y_i - ddot y_i)^2}} -#' -#' where \mjeqn{y_{i}}{y_i} is the value of the side variable of the \mjeqn{i}{i}th -#' observation, \mjeqn{\ddot{y}_i}{ddot y_i} is the value of the side variable -#' of the nearest neighbor of the \mjeqn{i}{i}th observation and \mjeqn{n}{n} -#' is the total number of observations. +#' where \mjeqn{NN(xr_i, Xr^{-i})}{NN(xr_i, Xr^{-i})} represents a function to +#' obtain the index of the nearest neighbor observation found in \mjeqn{Xr}{Xr} +#' (excluding the \mjeqn{i}{i}th observation) for \mjeqn{xr_i}{xr_i}, +#' \mjeqn{y_{i}}{y_i} is the value of the side variable of the \mjeqn{i}{i}th +#' observation, \mjeqn{y_{j(i)}}{y_{j(i)}} is the value of the side variable of +#' the nearest neighbor of the \mjeqn{i}{i}th observation and \mjeqn{m}{m} is +#' the total number of observations. #' #' If \code{side_info} is a factor the kappa index (\mjeqn{\kappa}{kappa}) is #' used instead the RMSD. It is computed as follows: @@ -68,7 +72,7 @@ #' informative values of the corresponding nearest neighbors in the second half #' of the columns.} #' } -#' @author Leonardo Ramirez-Lopez +#' @author \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} #' @references #' Ramirez-Lopez, L., Behrens, T., Schmidt, K., Stevens, A., Dematte, J.A.M., #' Scholten, T. 2013a. The spectrum-based learner: A new local approach for diff --git a/man/biter.Rd b/man/biter.Rd deleted file mode 100644 index 3de6ecb..0000000 --- a/man/biter.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rslocal_helpers.R -\name{biter} -\alias{biter} -\title{fits pls models at each r-local iteration} -\usage{ -biter( - itersubs, - Xu, - Yu, - iter_sequence, - optimization, - ncomp, - tune, - p, - number, - scale, - max_iter, - tol, - allow_parallel -) -} -\description{ -internal -} -\keyword{internal} diff --git a/man/cat_iter.Rd b/man/cat_iter.Rd deleted file mode 100644 index 9713416..0000000 --- a/man/cat_iter.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rslocal_helpers.R -\name{cat_iter} -\alias{cat_iter} -\title{iterates over elements to print} -\usage{ -cat_iter(x) -} -\description{ -internal. used for printing -} -\keyword{internal} diff --git a/man/cor_diss.Rd b/man/cor_diss.Rd index a3b0afc..83d5934 100644 --- a/man/cor_diss.Rd +++ b/man/cor_diss.Rd @@ -36,17 +36,19 @@ a matrix of the computed dissimilarities. Computes correlation and moving correlation dissimilarity matrices. } \details{ -The correlation dissimilarity \mjeqn{cd}{cd} between two observations -\mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} is computed as follows: +The correlation dissimilarity \mjeqn{d}{d} between two observations +\mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} is based on the Perason's +correlation coefficient (\mjeqn{\rho}{\rho}) and it can be computed as +follows: -\mjdeqn{cd(x_i, x_j) = \frac{1}{2}(1 - cor(x_i, x_j))}{cd(x_i, x_j) = 1/2 (1 - cor (x_i, x_j))} +\mjdeqn{d(x_i, x_j) = \frac{1}{2}(1 - \rho(x_i, x_j))}{d(x_i, x_j) = 1/2 (1 - \rho(x_i, x_j))} -The avobe formlula is used when \code{ws = NULL}. +The above formula is used when \code{ws = NULL}. On the other hand (when \code{ws != NULL}) the moving correlation -dissimilarity \mjeqn{mcd}{mcd} between two observations \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} +dissimilarity between two observations \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} is computed as follows: -\mjdeqn{mcd(x_i, x_j) = \frac{1}{2 ws}\sum_{k=1}^{p-ws}(1 - cor(x_{i,(k:k+ws)}, x_{j,(k:k+ws)}))}{mcd(x_i, x_j) = 1/(2 ws)\sum_(k=1)^{p-ws}(1 - cor(x_(i,k:k+ws), x_(j,k:k+ws)))} +\mjdeqn{d(x_i, x_j; ws) = \frac{1}{2 ws}\sum_{k=1}^{p-ws}1 - \rho(x_{i,(k:k+ws)}, x_{j,(k:k+ws)})}{d(x_i, x_j) = 1/(2 ws)\sum_(k=1)^{p-ws}(1 - \rho(x_(i,k:k+ws), x_(j,k:k+ws)))} where \mjeqn{ws}{ws} represents a given window size which rolls sequentially from 1 up to \mjeqn{p - ws}{p - ws} and \mjeqn{p}{p} is the number of @@ -72,5 +74,5 @@ cor_diss(Xr = Xr, Xu = Xu, ws = 41) } } \author{ -Antoine Stevens and Leonardo Ramirez-Lopez +Antoine Stevens and \href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} } diff --git a/man/f_diss.Rd b/man/f_diss.Rd index 8ef369b..26d0243 100644 --- a/man/f_diss.Rd +++ b/man/f_diss.Rd @@ -39,23 +39,21 @@ based on Euclidean or Mahalanobis distance measures or on cosine dissimilarity measures (a.k.a spectral angle mapper). } \details{ -the base [`dist`][base::dist()] - The results obtained for Euclidean dissimilarity are equivalent to those -returned by the base [`dist`][base::dist()] function, but are scaled +returned by the [stats::dist()] function, but are scaled differently. However, \code{f_diss} is considerably faster (which can be advantageous when computing dissimilarities for very large matrices). The final scaling of the dissimilarity scores in \code{f_diss} where the number of variables is used to scale the squared dissimilarity scores. See -the examples section for a comparison between [`dist`][base::dist()] and +the examples section for a comparison between [stats::dist()] and \code{f_diss}. In the case of both the Euclidean and Mahalanobis distances, the scaled dissimilarity matrix \mjeqn{D}{D} between between observations in a given matrix \mjeqn{X}{X} is computed as follows: -\mjdeqn{D(x_i, x_j)^{2} = (x_i - x_j)M^{-1}(x_i - x_j)^{\mathrm{T}}}{D(x_i, x_j)^{2} = (x_i - x_j)M^{-1}(x_i - x_j)^T} -\mjdeqn{D_{scaled}(x_i, x_j) = \sqrt{\frac{1}{p}D(x_i, x_j)^{2}}}{D_scaled (x_i, x_j) = sqrt(1/p D(x_i, x_j)^2)} +\mjdeqn{d(x_i, x_j)^{2} = \sum (x_i - x_j)M^{-1}(x_i - x_j)^{\mathrm{T}}}{D(x_i, x_j)^{2} = \sum (x_i - x_j)M^{-1}(x_i - x_j)^T} +\mjdeqn{d_{scaled}(x_i, x_j) = \sqrt{\frac{1}{p}d(x_i, x_j)^{2}}}{d_scaled (x_i, x_j) = sqrt(1/p d(x_i, x_j)^2)} where \mjeqn{p}{p} is the number of variables in \mjeqn{X}{X}, \mjeqn{M}{M} is the identity matrix in the case of the Euclidean distance and the variance-covariance @@ -76,10 +74,10 @@ smaller than the number of variables. For the computation of the Mahalanobis distance, the mentioned method is used. -The cosine dissimilarity \mjeqn{S}{S} between two observations +The cosine dissimilarity \mjeqn{c}{c} between two observations \mjeqn{x_i}{x_i} and \mjeqn{x_j}{x_j} is computed as follows: -\mjdeqn{S(x_i, x_j) = cos^{-1}{\frac{\sum_{k=1}^{p}x_{i,k} x_{j,k}}{\sqrt{\sum_{k=1}^{p} x_{i,k}^{2}} \sqrt{\sum_{k=1}^{p} x_{j,k}^{2}}}}}{S(x_i, x_j) = cos^{-1} ((sum_(k=1)^p x_(i,k) x_(j,k))/(sum_(k=1)^p x_(i,k) sum_(k=1)^p x_(j,k)))} +\mjdeqn{c(x_i, x_j) = cos^{-1}{\frac{\sum_{k=1}^{p}x_{i,k} x_{j,k}}{\sqrt{\sum_{k=1}^{p} x_{i,k}^{2}} \sqrt{\sum_{k=1}^{p} x_{j,k}^{2}}}}}{c(x_i, x_j) = cos^{-1} ((sum_(k=1)^p x_(i,k) x_(j,k))/(sum_(k=1)^p x_(i,k) sum_(k=1)^p x_(j,k)))} where \mjeqn{p}{p} is the number of variables of the observations. The function does not accept input data containing missing values. @@ -130,5 +128,5 @@ cdiss_xr_xu <- f_diss(Xr, Xu, "cosine") } } \author{ -Leonardo Ramirez-Lopez and Antoine Stevens +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens } diff --git a/man/get_predictions.Rd b/man/get_predictions.Rd index e753861..a08634f 100644 --- a/man/get_predictions.Rd +++ b/man/get_predictions.Rd @@ -21,5 +21,5 @@ Extract predictions from an object of class \code{mbl} \code{\link{mbl}} } \author{ -Leonardo Ramirez-Lopez and Antoine Stevens +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens } diff --git a/man/get_sample_strata.Rd b/man/get_sample_strata.Rd new file mode 100644 index 0000000..15fbc72 --- /dev/null +++ b/man/get_sample_strata.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_stratified.R +\name{get_sample_strata} +\alias{get_sample_strata} +\title{A function to assign values to sample distribution strata} +\usage{ +get_sample_strata(y, n) +} +\arguments{ +\item{y}{a matrix of one column with the response variable.} + +\item{n}{the number of strata.} +} +\value{ +a data table with the input \code{y} and the corresponding strata to +every value. +} +\description{ +for internal use only! This function takes a continuous variable, +creates n strata based on its distribution and assigns the corresponding starta +to every value. +} +\keyword{internal} diff --git a/man/get_samples_from_strata.Rd b/man/get_samples_from_strata.Rd new file mode 100644 index 0000000..4e92d2d --- /dev/null +++ b/man/get_samples_from_strata.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_stratified.R +\name{get_samples_from_strata} +\alias{get_samples_from_strata} +\title{A function for stratified calibration/validation sampling} +\usage{ +get_samples_from_strata( + original_order, + strata, + samples_per_strata, + sampling_for = c("calibration", "validation"), + replacement = FALSE +) +} +\arguments{ +\item{original_order}{a matrix of one column with the response variable.} + +\item{sampling_for}{sampling to select the calibration samples ("calibration") +or sampling to select the validation samples ("validation").} + +\item{replacement}{logical indicating if sampling with replacement must be +done.} + +\item{starta}{the number of strata.} +} +\value{ +a list with the indices of the calibration and validation samples. +} +\description{ +for internal use only! This function selects samples +based on provided strata. +} +\keyword{internal} diff --git a/man/ithrssubsets.Rd b/man/ithrssubsets.Rd deleted file mode 100644 index 1da09b6..0000000 --- a/man/ithrssubsets.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/rslocal_helpers.R -\name{ithrssubsets} -\alias{ithrssubsets} -\title{iterator that subsets a matrix based on input indices for pls modeling -at each rs-local iteration} -\usage{ -ithrssubsets(x, y, group = NULL, indx) -} -\arguments{ -\item{x}{an input matrix of predictors} - -\item{y}{a response variable} - -\item{group}{a variable giving the groups/calsses to which the observations -belong to (used for avoiding pseudo-replication during validation)} - -\item{indx}{the indices to retrieve} -} -\value{ -an object of \code{class} iterator giving a \code{list} with (a) a -subset of the input matrix with rows re-arranged according to `indx` -} -\description{ -internal -} -\details{ -this is designed to be used within (parallel) loops to avoid sending -entire matrices to each core. It just sends the subset of that is required. -} -\author{ -Leonardo Ramirez-Lopez -} -\keyword{internal} diff --git a/man/local_fit.Rd b/man/local_fit.Rd index 378d55b..99db434 100644 --- a/man/local_fit.Rd +++ b/man/local_fit.Rd @@ -107,5 +107,5 @@ Massachusetts Institute of Technology: MIT-Press, 2006. \code{\link{mbl}} } \author{ -Leonardo Ramirez-Lopez +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} } diff --git a/man/mbl.Rd b/man/mbl.Rd index f547b65..1217774 100644 --- a/man/mbl.Rd +++ b/man/mbl.Rd @@ -121,7 +121,7 @@ This list must contain two elements in the following order: components) and \code{value} (a numerical value that complements the selected method). The methods available are: \itemize{ - \item{\code{"opc"}:} {optimized principal component selection based on + \item{\code{"opc"}:} { optimized principal component selection based on Ramirez-Lopez et al. (2013a, 2013b). The optimal number of components (of set of observations) is the one for which its distance matrix minimizes the differences between the \code{Yr} value of each @@ -131,17 +131,17 @@ method). The methods available are: number of principal components to be tested. See the \code{\link{ortho_projection}} function for more details.} - \item{\code{"cumvar"}:}{selection of the principal components based + \item{\code{"cumvar"}:}{ selection of the principal components based on a given cumulative amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the maximum amount of cumulative variance that the - retained components should explain.} + indicating the minimum amount of cumulative variance that the + combination of retained components should explain.} - \item{\code{"var"}:}{selection of the principal components based + \item{\code{"var"}:}{ selection of the principal components based on a given amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the minimum amount of variance that a component should - explain in order to be retained.} + indicating the minimum amount of variance that a single component + should explain in order to be retained.} \item{\code{"manual"}:}{ for manually specifying a fix number of principal components. In this case, \code{value} must be a value @@ -149,12 +149,12 @@ method). The methods available are: indicating the minimum amount of variance that a component should explain in order to be retained.} } -The default list passed is \code{list(method = "cumvar", value = 0.99)}. +The default list passed is \code{list(method = "opc", value = min(dim(Xr), 40))}. Optionally, the \code{pc_selection} argument admits \code{"opc"} or \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character string. In such a case the default \code{"value"} when either \code{"opc"} or \code{"manual"} are used is 40. When \code{"cumvar"} is used the default -\code{"value"} is set to 0.99 and when \code{"var"} is used the default +\code{"value"} is set to 0.99 and when \code{"var"} is used, the default \code{"value"} is set to 0.01.} \item{control}{a list created with the \code{\link{mbl_control}} function @@ -373,25 +373,24 @@ each local segment. library(prospectr) data(NIRsoil) -# Filter the data using the Savitzky and Golay smoothing filter with -# a window size of 11 spectral variables and a polynomial order of 3 -# (no differentiation). -sg <- savitzkyGolay(NIRsoil$spc, p = 3, w = 11, m = 0) - -# Replace the original spectra with the filtered ones -NIRsoil$spc <- sg - -Xu <- NIRsoil$spc[!as.logical(NIRsoil$train), ] -Yu <- NIRsoil$CEC[!as.logical(NIRsoil$train)] +# Proprocess the data using detrend plus first derivative with Savitzky and +# Golay smoothing filter +sg_det <- savitzkyGolay( + detrend(NIRsoil$spc, + wav = as.numeric(colnames(NIRsoil$spc))), + m = 1, + p = 1, + w = 7 +) -Yr <- NIRsoil$CEC[as.logical(NIRsoil$train)] -Xr <- NIRsoil$spc[as.logical(NIRsoil$train), ] +NIRsoil$spc_pr <- sg_det -Xu <- Xu[!is.na(Yu), ] -Xr <- Xr[!is.na(Yr), ] +# split into training and testing sets +test_x <- NIRsoil$spc_pr[NIRsoil$train == 0 & !is.na(NIRsoil$CEC),] +test_y <- NIRsoil$CEC[NIRsoil$train == 0 & !is.na(NIRsoil$CEC)] -Yu <- Yu[!is.na(Yu)] -Yr <- Yr[!is.na(Yr)] +train_y <- NIRsoil$CEC[NIRsoil$train == 1 & !is.na(NIRsoil$CEC)] +train_x <- NIRsoil$spc_pr[NIRsoil$train == 1 & !is.na(NIRsoil$CEC),] # Example 1 # A mbl implemented in Ramirez-Lopez et al. (2013, @@ -405,10 +404,13 @@ my_control <- mbl_control(validation_type = "NNv") ks <- seq(40, 140, by = 20) sbl <- mbl( - Xr = Xr, Yr = Yr, Xu = Xu, + Xr = train_x, + Yr = train_y, + Xu = test_x, k = ks, method = local_fit_gpr(), control = my_control, + scale = TRUE ) sbl plot(sbl) @@ -417,10 +419,13 @@ get_predictions(sbl) # Example 1.2 # If Yu is actually known... sbl_2 <- mbl( - Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, + Xr = train_x, + Yr = train_y, + Xu = test_x, + Yu = test_y, k = ks, method = local_fit_gpr(), - control = my_control, + control = my_control ) sbl_2 plot(sbl_2) @@ -428,12 +433,15 @@ plot(sbl_2) # Example 2 # the LOCAL algorithm (Shenk et al., 1997) local_algorithm <- mbl( - Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, + Xr = train_x, + Yr = train_y, + Xu = test_x, + Yu = test_y, k = ks, method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), diss_method = "cor", diss_usage = "none", - control = my_control, + control = my_control ) local_algorithm plot(local_algorithm) @@ -443,44 +451,46 @@ plot(local_algorithm) # dissmilarity matrix) and dissimilarity matrix as source of # additional preditors local_algorithm_2 <- mbl( - Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, + Xr = train_x, + Yr = train_y, + Xu = test_x, + Yu = test_y, k = ks, method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), diss_method = "pca", diss_usage = "predictors", - control = my_control, + control = my_control ) local_algorithm_2 -plot(local_algorithm) +plot(local_algorithm_2) # Example 4 # Running the mbl function in parallel with example 2 -library(future) - -n_cores <- availableCores() - 1 +n_cores <- parallel::detectCores() - 1 if (n_cores == 0) { n_cores <- 1 } -# Set the number of cores according to the OS -if (.Platform$OS.type == "windows") { - library(doParallel) - clust <- makeCluster(n_cores) - registerDoParallel(clust) -} else { - library(doSNOW) - clust <- makeCluster(n_cores, type = "SOCK") - registerDoSNOW(clust) - getDoParWorkers() -} +library(doParallel) +clust <- makeCluster(n_cores) +registerDoParallel(clust) + +# Alernatively: +# library(doSNOW) +# clust <- makeCluster(n_cores, type = "SOCK") +# registerDoSNOW(clust) +# getDoParWorkers() local_algorithm_par <- mbl( - Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, + Xr = train_x, + Yr = train_y, + Xu = test_x, + Yu = test_y, k = ks, method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), diss_method = "cor", diss_usage = "none", - control = my_control, + control = my_control ) local_algorithm_par @@ -490,13 +500,17 @@ try(stopCluster(clust)) # Example 5 # Using local pls distances with_local_diss <- mbl( - Xr = Xr, Yr = Yr, Xu = Xu, Yu = Yu, + Xr = train_x, + Yr = train_y, + Xu = test_x, + Yu = test_y, k = ks, method = local_fit_wapls(min_pls_c = 3, max_pls_c = 15), diss_method = "pls", diss_usage = "predictors", control = my_control, - .local = TRUE, pre_k = 150, + .local = TRUE, + pre_k = 150, ) with_local_diss plot(with_local_diss) @@ -536,5 +550,5 @@ Infrared Spectroscopy, 5, 223-232. \code{\link{search_neighbors}} } \author{ -Leonardo Ramirez-Lopez and Antoine Stevens +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens } diff --git a/man/mbl_control.Rd b/man/mbl_control.Rd index 372180d..c89c202 100644 --- a/man/mbl_control.Rd +++ b/man/mbl_control.Rd @@ -26,16 +26,16 @@ below).} \item{tune_locally}{a logical. It only applies when \code{validation_type = "local_cv"} and "pls" or "wapls" fitting algorithms are -used. If \code{TRUE}, the the parameters of the local pls-based models +used. If \code{TRUE}, then the parameters of the local pls-based models (i.e. pls factors for the "pls" method and minimum and maximum pls factors -for the "wapls" method). Default is #' \code{TRUE}.} +for the "wapls" method) are optimized. Default is \code{TRUE}.} \item{number}{an integer indicating the number of sampling iterations at each local segment when \code{"local_cv"} is selected in the \code{validation_type} argument. Default is 10.} -\item{p}{a numeric value indicating the percentage of observations to be retained -at each sampling iteration at each local segment when \code{"local_cv"} +\item{p}{a numeric value indicating the percentage of calibration observations +to be retained at each sampling iteration at each local segment when \code{"local_cv"} is selected in the \code{validation_type} argument. Default is 0.75 (i.e. 75 "\%").} \item{range_prediction_limits}{a logical. It indicates whether the prediction @@ -74,24 +74,28 @@ the memory-based learning method used are described as follows: the group of neighbors of each observation to be predicted, the nearest observation (i.e. the most similar observation) is excluded and then a local model is fitted using the remaining neighbors. This model is then used to predict the value - of the target response variable of the nearest observation. These predicted + of the response variable of the nearest observation. These predicted values are finally cross validated with the actual values (See Ramirez-Lopez et al. (2013a) for additional details). This method is faster than \code{"local_cv"}.} \item{Local leave-group-out cross-validation (\code{"local_cv"}):}{ The group of neighbors of each observation to be predicted is partitioned into different equal size subsets. Each partition is selected based on a - stratified random sampling which takes into account the values of the - response variable of the corresponding set of neighbors. The selected - local subset is used as local validation subset and the remaining observations - are used for fitting a model. This model is used to predict the target - response variable values of the local validation subset and the local root - mean square error is computed. This process is repeated \mjeqn{m}{m} times and - the final local error is computed as the average of the local root mean - square error of all the \mjeqn{m}{m} iterations. In the \code{mbl} function - \mjeqn{m}{m} is controlled by the \code{number} argument and the size of the - subsets is controlled by the \code{p} argument which indicates the - percentage of observations to be selected from the subset of nearest neighbours. + stratified random sampling that uses the the distribution of + the response variable in the corresponding set of neighbors. When + \code{p} \mjeqn{>=}{\geqslant} 0.5 (i.e. the number of calibration + observations to retain is larger than 50% of the total samples in the neighborhood), + the sampling is conducted for selecting the validation samples, and when + \code{p} < 0.5 the sampling is conducted for selecting the calibration + samples (samples used for model fitting). The model fitted with the selected + calibration samples is used to predict the response values of the local + validation samples and the local root mean square error is computed. + This process is repeated \mjeqn{m}{m} times and the final local + error is computed as the average of the local root mean square errors + obtained for all the \mjeqn{m}{m} iterations. In the \code{mbl_control} function + \mjeqn{m}{m} is controlled by the \code{number} argument and the size of the + subsets is controlled by the \code{p} argument which indicates the + percentage of observations to be selected from the subset of nearest neighbors. The global error of the predictions is computed as the average of the local root mean square errors.} \item{No validation (\code{"none"}):}{ No validation is carried out. @@ -118,5 +122,5 @@ use with soil vis-NIR spectra. Geoderma 199, 43-53. \code{\link{ortho_diss}}, \code{\link{mbl}} } \author{ -Leonardo Ramirez-Lopez and Antoine Stevens +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} and Antoine Stevens } diff --git a/man/opls_for_projection.Rd b/man/opls_for_projection.Rd index fa4a3d7..28c0b91 100644 --- a/man/opls_for_projection.Rd +++ b/man/opls_for_projection.Rd @@ -6,8 +6,8 @@ \usage{ opls_for_projection(X, Y, ncomp, scale, maxiter, tol, - pcSelmethod = "cumvar", - pcSelvalue = 0.99) + pcSelmethod = "var", + pcSelvalue = 0.01) } \arguments{ \item{X}{a matrix of predictor variables.} diff --git a/man/ortho_diss.Rd b/man/ortho_diss.Rd index ef0b47a..1c8eea4 100644 --- a/man/ortho_diss.Rd +++ b/man/ortho_diss.Rd @@ -7,7 +7,7 @@ projections (ortho_diss)} \usage{ ortho_diss(Xr, Xu = NULL, Yr = NULL, - pc_selection = list(method = "cumvar", value = 0.99), + pc_selection = list(method = "var", value = 0.01), diss_method = "pca", .local = FALSE, pre_k, @@ -60,14 +60,14 @@ method). The methods available are: \item{\code{"cumvar"}:}{ selection of the principal components based on a given cumulative amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the maximum amount of cumulative variance that the - retained components should explain.} + indicating the minimum amount of cumulative variance that the + combination of retained components should explain.} \item{\code{"var"}:}{ selection of the principal components based on a given amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the minimum amount of variance that a component should - explain in order to be retained.} + indicating the minimum amount of variance that a single component + should explain in order to be retained.} \item{\code{"manual"}:}{ for manually specifying a fix number of principal components. In this case, \code{value} must be a value @@ -75,12 +75,12 @@ method). The methods available are: indicating the minimum amount of variance that a component should explain in order to be retained.} } -The default list passed is \code{list(method = "cumvar", value = 0.99)}. +The default list passed is \code{list(method = "var", value = 0.01)}. Optionally, the \code{pc_selection} argument admits \code{"opc"} or \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character string. In such case, the default \code{"value"} when either \code{"opc"} or \code{"manual"} are used is 40. When \code{"cumvar"} is used the default -\code{"value"} is set to 0.99 and when \code{"var"} is used the default +\code{"value"} is set to 0.99 and when \code{"var"} is used, the default \code{"value"} is set to 0.01.} \item{diss_method}{a character value indicating the type of projection on which @@ -149,7 +149,7 @@ a \code{list} of class \code{ortho_diss} with the following elements: between each target observation and its neighbor observations.} \item{\code{dissimilarity}}{ the computed dissimilarity matrix. If \code{.local = FALSE} a distance matrix. If \code{.local = TRUE} a matrix of - class \code{localortho_diss}. In this case, each column represent the dissimilarity + class \code{local_ortho_diss}. In this case, each column represent the dissimilarity between a target observation and its neighbor observations.} \item{\code{projection}}{if \code{return_projection = TRUE}, an \code{ortho_projection} object.} @@ -172,7 +172,7 @@ observation, a given set of nearest neighbors (\code{pre_k}) are identified. These neighbors (together with the target observation) are projected (from the original data space) onto a (local) orthogonal space (using the same parameters specified in the function). In this projected space the -Mahalanobis distance between the target observation and the neighbors is +Mahalanobis distance between the target observation and its neighbors is recomputed. A missing value is assigned to the observations that do not belong to this set of neighbors (non-neighbor observations). In this case the dissimilarity matrix cannot be considered as a distance @@ -245,5 +245,5 @@ with soil vis-NIR spectra. Geoderma 199, 43-53. \code{\link{ortho_projection}}, \code{\link{sim_eval}} } \author{ -Leonardo Ramirez-Lopez +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} } diff --git a/man/ortho_projection.Rd b/man/ortho_projection.Rd index b416a1f..bc8b259 100644 --- a/man/ortho_projection.Rd +++ b/man/ortho_projection.Rd @@ -11,11 +11,11 @@ least squares} ortho_projection(Xr, Xu = NULL, Yr = NULL, method = "pca", - pc_selection = list(method = "cumvar", value = 0.99), + pc_selection = list(method = "var", value = 0.01), center = TRUE, scale = FALSE, ...) pc_projection(Xr, Xu = NULL, Yr = NULL, - pc_selection = list(method = "cumvar", value = 0.99), + pc_selection = list(method = "var", value = 0.01), center = TRUE, scale = FALSE, method = "pca", tol = 1e-6, max_iter = 1000, ...) @@ -67,13 +67,13 @@ method). The methods available are: \item{\code{"cumvar"}:}{ selection of the principal components based on a given cumulative amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the maximum amount of cumulative variance that the - retained components should explain.} + indicating the minimum amount of cumulative variance that the + combination of retained components should explain.} \item{\code{"var"}:}{ selection of the principal components based on a given amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the minimum amount of variance that a component should + indicating the minimum amount of variance that a single component should explain in order to be retained.} \item{\code{"manual"}:}{ for manually specifying a fix number of @@ -82,12 +82,12 @@ method). The methods available are: indicating the minimum amount of variance that a component should explain in order to be retained.} } -The default list passed is \code{list(method = "cumvar", value = 0.99)}. +The default list passed is \code{list(method = "var", value = 0.01)}. Optionally, the \code{pc_selection} argument admits \code{"opc"} or \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character string. In such a case the default \code{"value"} when either \code{"opc"} or \code{"manual"} are used is 40. When \code{"cumvar"} is used the default -\code{"value"} is set to 0.99 and when \code{"var"} is used the default +\code{"value"} is set to 0.99 and when \code{"var"} is used, the default \code{"value"} is set to 0.01.} \item{center}{a logical indicating if the data \code{Xr} (and \code{Xu} if @@ -139,8 +139,9 @@ components: onto a "pls" space. This object is only returned if the "pls" algorithm was used.} \item{\code{variance}}{ a matrix indicating the standard deviation of each - component (sd), the cumulative explained variance (cumulative_explained_var) and the - variance explained by each single component (explained_var). These values are + component (sd), the variance explained by each single component + (explained_var) and the cumulative explained variance + (cumulative_explained_var). These values are computed based on the data used to create the projection matrices. For example if the "pls" method was used, then these values are computed based only on the data that contains information on \code{Yr} (i.e. the @@ -216,25 +217,32 @@ via OpenMP in Rcpp. library(prospectr) data(NIRsoil) -Xu <- NIRsoil$spc[!as.logical(NIRsoil$train), ] -Yu <- NIRsoil[!as.logical(NIRsoil$train), "CEC", drop = FALSE] -Yr <- NIRsoil[as.logical(NIRsoil$train), "CEC", drop = FALSE] -Xr <- NIRsoil$spc[as.logical(NIRsoil$train), ] +# Proprocess the data using detrend plus first derivative with Savitzky and +# Golay smoothing filter +sg_det <- savitzkyGolay( + detrend(NIRsoil$spc, + wav = as.numeric(colnames(NIRsoil$spc))), + m = 1, + p = 1, + w = 7 +) +NIRsoil$spc_pr <- sg_det -Xu <- Xu[!is.na(Yu), ] -Yu <- Yu[!is.na(Yu), , drop = FALSE] +# split into training and testing sets +test_x <- NIRsoil$spc_pr[NIRsoil$train == 0 & !is.na(NIRsoil$CEC),] +test_y <- NIRsoil$CEC[NIRsoil$train == 0 & !is.na(NIRsoil$CEC)] -Xr <- Xr[!is.na(Yr), ] -Yr <- Yr[!is.na(Yr), , drop = FALSE] +train_y <- NIRsoil$CEC[NIRsoil$train == 1 & !is.na(NIRsoil$CEC)] +train_x <- NIRsoil$spc_pr[NIRsoil$train == 1 & !is.na(NIRsoil$CEC),] # A principal component analysis using 5 components -pca_projected <- ortho_projection(Xr, pc_selection = list("manual", 5)) +pca_projected <- ortho_projection(train_x, pc_selection = list("manual", 5)) pca_projected # A principal components projection using the "opc" method # for the selection of the optimal number of components pca_projected_2 <- ortho_projection( - Xr, Xu, Yr, + Xr = train_x, Xu = test_x, Yr, method = "pca", pc_selection = list("opc", 40) ) @@ -244,7 +252,7 @@ plot(pca_projected_2) # A partial least squares projection using the "opc" method # for the selection of the optimal number of components pls_projected <- ortho_projection( - Xr, Xu, Yr, + Xr = train_x, Xu = test_x, Yr, method = "pls", pc_selection = list("opc", 40) ) @@ -254,7 +262,7 @@ plot(pls_projected) # A partial least squares projection using the "cumvar" method # for the selection of the optimal number of components pls_projected_2 <- ortho_projection( - Xr = Xr, Yr = Yr, Xu = Xu, + Xr = train_x, Yr = train_y, Xu = test_x, method = "pls", pc_selection = list("cumvar", 0.99) ) @@ -275,5 +283,5 @@ with soil vis-NIR spectra. Geoderma 199, 43-53. \code{\link{ortho_diss}}, \code{\link{sim_eval}}, \code{\link{mbl}} } \author{ -Leonardo Ramirez-Lopez +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} } diff --git a/man/pca_nipals.Rd b/man/pca_nipals.Rd index 6f63773..38b0b35 100644 --- a/man/pca_nipals.Rd +++ b/man/pca_nipals.Rd @@ -6,8 +6,8 @@ \usage{ pca_nipals(X, ncomp, center, scale, maxiter, tol, - pcSelmethod = "cumvar", - pcSelvalue = 0.99) + pcSelmethod = "var", + pcSelvalue = 0.01) } \arguments{ \item{X}{a matrix of predictor variables.} @@ -23,14 +23,14 @@ pca_nipals(X, ncomp, center, scale, \item{pcSelmethod}{the method for selecting the number of components. Options are: \code{'cumvar'} (for selecting the number of principal components based on a given cumulative amount of explained variance) and \code{"var"} (for selecting the number of principal -components based on a given amount of explained variance). Default is \code{'cumvar'}} +components based on a given amount of explained variance). Default is \code{'var'}} \item{pcSelvalue}{a numerical value that complements the selected method (\code{pcSelmethod}). If \code{"cumvar"} is chosen, it must be a value (larger than 0 and below 1) indicating the maximum amount of cumulative variance that the retained components should explain. If \code{"var"} is chosen, it must be a value (larger than 0 and below 1) indicating that components that explain (individually) a variance lower than this threshold must be excluded. If \code{"manual"} is chosen, it must be a value -specifying the desired number of principal components to retain. Default is 0.99.} +specifying the desired number of principal components to retain. Default is 0.01.} \item{Y}{a matrix of either a single or multiple response variables.} } diff --git a/man/plot.ortho_projection.Rd b/man/plot.ortho_projection.Rd index 69f63c7..dcba34d 100644 --- a/man/plot.ortho_projection.Rd +++ b/man/plot.ortho_projection.Rd @@ -4,11 +4,13 @@ \alias{plot.ortho_projection} \title{Plot method for an object of class \code{ortho_projection}} \usage{ -\method{plot}{ortho_projection}(x, ...) +\method{plot}{ortho_projection}(x, col = "dodgerblue", ...) } \arguments{ \item{x}{an object of class \code{ortho_projection} (as returned by \code{ortho_projection}).} +\item{col}{the color of the plots (default is "dodgerblue")} + \item{...}{arguments to be passed to methods.} } \description{ diff --git a/man/print.local_ortho_diss.Rd b/man/print.local_ortho_diss.Rd index 6decfda..8fed70b 100644 --- a/man/print.local_ortho_diss.Rd +++ b/man/print.local_ortho_diss.Rd @@ -7,7 +7,7 @@ \method{print}{local_ortho_diss}(x, ...) } \arguments{ -\item{x}{an object of class \code{localortho_diss} (returned by +\item{x}{an object of class \code{local_ortho_diss} (returned by \code{ortho_diss} when it uses \code{.local = TRUE}).} \item{...}{arguments to be passed to methods (not yet functional).} diff --git a/man/resemble-package.Rd b/man/resemble-package.Rd index eb92e09..d092840 100644 --- a/man/resemble-package.Rd +++ b/man/resemble-package.Rd @@ -17,7 +17,10 @@ implements a number of \code{R} functions useful for modeling complex spectral spectra (e.g. NIR, IR). The package includes functions for dimensionality reduction, computing spectral dissimilarity matrices, nearest neighbor search, -and modeling spectral data using memory-based learning. +and modeling spectral data using memory-based learning. + +Development versions can be found in the github repository of the package +at \href{https://github.com/l-ramirez-lopez/resemble}{https://github.com/l-ramirez-lopez/resemble}. The functions available for dimensionality reduction are: \itemize{ @@ -54,15 +57,15 @@ Other supplementary functions: } } \author{ -Leonardo Ramirez-Lopez +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} [aut, cre] -Antoine Stevens +\href{https://orcid.org/0000-0002-1588-7519}{Antoine Stevens} [ctb] -Raphael Viscarra Rossel +\href{https://orcid.org/0000-0003-1540-4748}{Raphael Viscarra Rossel} [ctb] -Craig Lobsey +\href{https://orcid.org/0000-0001-5416-8640}{Craig Lobsey} [ctb] -Alex Wadoux +\href{https://orcid.org/0000-0001-7325-9716}{Alex Wadoux} [ctb] -Timo Breure +\href{https://orcid.org/0000-0001-5695-8064}{Timo Breure} [ctb] } diff --git a/man/sample_str.Rd b/man/sample_str.Rd deleted file mode 100644 index 9638a2c..0000000 --- a/man/sample_str.Rd +++ /dev/null @@ -1,33 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/local_helpers.R -\name{sample_str} -\alias{sample_str} -\title{A function to create the sample sets for calibration/validation in -cross-validation.} -\usage{ -sample_str(y, p, number, group = NULL) -} -\arguments{ -\item{y}{a matrix of one column with the response variable.} - -\item{p}{the percentage of samples (or groups if group argument is used) to -retain in the hold_in set} - -\item{number}{the number of sample groups to be crated} - -\item{group}{the labels for each sample in \code{y} indicating the group each -observation belongs to.} -} -\value{ -a list with two matrices (\code{hold_in} and \code{hold_out}) giving -the indices of the observations in each column. The number of colums represents -the number of sampling repetitions. -} -\description{ -for internal use only! This is stratified sampling based on the -values of a continous response variable (y). If group is provided, the -sampling is done based on the groups and the average of y per group. This -function is used to create groups forleave-group-out cross-validations (or -leave-group-of-groups-out cross-validation if group argument is provided). -} -\keyword{internal} diff --git a/man/sample_stratified.Rd b/man/sample_stratified.Rd new file mode 100644 index 0000000..6d7f773 --- /dev/null +++ b/man/sample_stratified.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sample_stratified.R +\name{sample_stratified} +\alias{sample_stratified} +\title{A function to create calibration and validation sample sets for +leave-group-out and bootstraping cross-validation} +\usage{ +sample_stratified(y, p, number, group = NULL, replacement = FALSE) +} +\arguments{ +\item{y}{a matrix of one column with the response variable.} + +\item{p}{the percentage of samples (or groups if group argument is used) to +retain in the validation_indices set} + +\item{number}{the number of sample groups to be crated} + +\item{group}{the labels for each sample in \code{y} indicating the group each +observation belongs to.} + +\item{replacement}{A logical indicating sample replacements for the +calibration set are required.} +} +\value{ +a list with two matrices (\code{hold_in} and +\code{hold_out}) giving the indices of the observations in each +column. The number of columns represents the number of sampling repetitions. +} +\description{ +for internal use only! This is stratified sampling based on the +values of a continuous response variable (y). If group is provided, the +sampling is done based on the groups and the average of y per group. This +function is used to create calibration and validation groups for +leave-group-out cross-validations (or +leave-group-of-groups-out cross-validation if group argument is provided) +and/or bootstrapping. +} +\keyword{internal} diff --git a/man/search_neighbors.Rd b/man/search_neighbors.Rd index e3c1058..38b49f0 100644 --- a/man/search_neighbors.Rd +++ b/man/search_neighbors.Rd @@ -8,7 +8,7 @@ another given set of observations (search_neighbors)} search_neighbors(Xr, Xu, diss_method = c("pca", "pca.nipals", "pls", "cor", "euclid", "cosine", "sid"), Yr = NULL, k, k_diss, k_range, spike = NULL, - pc_selection = list("cumvar", 0.99), + pc_selection = list("var", 0.01), return_projection = FALSE, return_dissimilarity = FALSE, ws = NULL, center = TRUE, scale = FALSE, @@ -90,7 +90,7 @@ to be retained. This list must contain two elements in the following order: components) and \code{value} (a numerical value that complements the selected method). The methods available are: \itemize{ - \item{\code{"opc"}:} {optimized principal component selection based on + \item{\code{"opc"}:} { optimized principal component selection based on Ramirez-Lopez et al. (2013a, 2013b). The optimal number of components (of set of observations) is the one for which its distance matrix minimizes the differences between the \code{Yr} value of each @@ -100,17 +100,17 @@ method). The methods available are: number of principal components to be tested. See the \code{\link{ortho_projection}} function for more details.} - \item{\code{"cumvar"}:}{selection of the principal components based + \item{\code{"cumvar"}:}{ selection of the principal components based on a given cumulative amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the maximum amount of cumulative variance that the - retained components should explain.} + indicating the minimum amount of cumulative variance that the + combination of retained components should explain.} - \item{\code{"var"}:}{selection of the principal components based + \item{\code{"var"}:}{ selection of the principal components based on a given amount of explained variance. In this case, \code{value} must be a value (larger than 0 and below or equal to 1) - indicating the minimum amount of variance that a component should - explain in order to be retained.} + indicating the minimum amount of variance that a single component + should explain in order to be retained.} \item{\code{"manual"}:}{ for manually specifying a fix number of principal components. In this case, \code{value} must be a value @@ -118,12 +118,12 @@ method). The methods available are: indicating the minimum amount of variance that a component should explain in order to be retained.} } -The default list passed is \code{list(method = "cumvar", value = 0.99)}. +The default list passed is \code{list(method = "var", value = 0.01)}. Optionally, the \code{pc_selection} argument admits \code{"opc"} or \code{"cumvar"} or \code{"var"} or \code{"manual"} as a single character string. In such a case the default \code{"value"} when either \code{"opc"} or \code{"manual"} are used is 40. When \code{"cumvar"} is used the default -\code{"value"} is set to 0.99 and when \code{"var"} is used the default +\code{"value"} is set to 0.99 and when \code{"var"} is used, the default \code{"value"} is set to 0.01.} \item{return_projection}{a logical indicating if the projection(s) must be @@ -211,11 +211,11 @@ used to reduce the size of the reference set before before running the \code{\link{mbl}} function. This function uses the \code{\link{dissimilarity}} fucntion to compute the -dissimilarities between code{Xr} and \code{Xu}. Arguments to +dissimilarities between \code{Xr} and \code{Xu}. Arguments to \code{\link{dissimilarity}} as well as further arguments to the functions used inside \code{\link{dissimilarity}} (i.e. \code{\link{ortho_diss}} \code{\link{cor_diss}} \code{\link{f_diss}} \code{\link{sid}}) can be passed to -those functions by using \code{...}. +those functions as additional arguments (i.e. \code{...}). } \examples{ \dontrun{ @@ -297,5 +297,5 @@ metrics for use with soil vis-NIR spectra. Geoderma 199, 43-53. \code{\link{mbl}} } \author{ -Leonardo Ramirez-Lopez +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} } diff --git a/man/sid.Rd b/man/sid.Rd index 3202887..c1cf5a1 100644 --- a/man/sid.Rd +++ b/man/sid.Rd @@ -86,7 +86,7 @@ a \code{list} with the following components: \lifecycle{experimental} \loadmathjax -This function computes the spectral information divergence (distance) between +This function computes the spectral information divergence/dissimilarity between spectra based on the kullback-leibler divergence algorithm (see details). } \details{ @@ -104,9 +104,9 @@ Kullback-Leibler divergence (\mjeqn{KL}{KL}) or relative entropy by each spectrum. The SID between two spectra (\mjeqn{x_{i}}{x_i} and \mjeqn{x_{j}}{x_j}) is computed as follows: -\mjdeqn{SID(x_{i},x_{j}) = KL(x_{i} \left |\right | x_{j}) + KL(x_{j} \left |\right | x_{i})}{SID(x_i,x_j) = KL(x_i |\ | x_j) + KL(x_j |\ | x_i)} +\mjdeqn{sid(x_{i},x_{j}) = KL(x_{i} \left |\right | x_{j}) + KL(x_{j} \left |\right | x_{i})}{sid(x_i,x_j) = KL(x_i |\ | x_j) + KL(x_j |\ | x_i)} -\mjdeqn{SID(x_{i},x_{j}) = \sum_{l=1}^{k} p_l \ log(\frac{p_l}{q_l}) + \sum_{l=1}^{k} q_l \ log(\frac{q_l}{p_l})}{SID(x_i,x_j) = \sum_{l=1}^{k} p_l \ log(p_l/q_l) + \sum_{l=1}^{k} q_l \ log(q_l/p_l)} +\mjdeqn{sid(x_{i},x_{j}) = \sum_{l=1}^{k} p_l \ log(\frac{p_l}{q_l}) + \sum_{l=1}^{k} q_l \ log(\frac{q_l}{p_l})}{sid(x_i,x_j) = \sum_{l=1}^{k} p_l \ log(p_l/q_l) + \sum_{l=1}^{k} q_l \ log(q_l/p_l)} where \mjeqn{k}{k} represents the number of variables or spectral features, \mjeqn{p}{p} and \mjeqn{q}{q} are the probability vectors of \mjeqn{x_{i}}{x_i} and diff --git a/man/sim_eval.Rd b/man/sim_eval.Rd index 3942bde..3e05671 100644 --- a/man/sim_eval.Rd +++ b/man/sim_eval.Rd @@ -54,12 +54,16 @@ is used for assessing the similarity between the observations and their corresponding most similar observations in terms of the side information provided. It is computed as follows: -\mjdeqn{RMSD = \sqrt{\frac{1}{n} \sum_{i=1}^n {(y_i - \ddot{y}_i)^2}}}{RMSD = \sqrt{1/n sum_{i=1}^n (y_i - ddot y_i)^2}} +\mjdeqn{j(i) = NN(xr_i, Xr^{\{-i\}})}{j(i) = NN(xr_i, Xr^{\{-i\}})} +\mjdeqn{RMSD = \sqrt{\frac{1}{m} \sum_{i=1}^n {(y_i - y_{j(i)})^2}}}{RMSD = \sqrt{1/n sum_{i=1}^m (y_i - y_{j(i)})^2}} -where \mjeqn{y_{i}}{y_i} is the value of the side variable of the \mjeqn{i}{i}th -observation, \mjeqn{\ddot{y}_i}{ddot y_i} is the value of the side variable -of the nearest neighbor of the \mjeqn{i}{i}th observation and \mjeqn{n}{n} -is the total number of observations. +where \mjeqn{NN(xr_i, Xr^{-i})}{NN(xr_i, Xr^{-i})} represents a function to +obtain the index of the nearest neighbor observation found in \mjeqn{Xr}{Xr} +(excluding the \mjeqn{i}{i}th observation) for \mjeqn{xr_i}{xr_i}, +\mjeqn{y_{i}}{y_i} is the value of the side variable of the \mjeqn{i}{i}th +observation, \mjeqn{y_{j(i)}}{y_{j(i)}} is the value of the side variable of +the nearest neighbor of the \mjeqn{i}{i}th observation and \mjeqn{m}{m} is +the total number of observations. If \code{side_info} is a factor the kappa index (\mjeqn{\kappa}{kappa}) is used instead the RMSD. It is computed as follows: @@ -161,5 +165,5 @@ Dematte, J. A. M., Scholten, T. 2013b. Distance and similarity-search metrics for use with soil vis-NIR spectra. Geoderma 199, 43-53. } \author{ -Leonardo Ramirez-Lopez +\href{https://orcid.org/0000-0002-5369-5120}{Leonardo Ramirez-Lopez} } diff --git a/man/sub-.local_ortho_diss.Rd b/man/sub-.local_ortho_diss.Rd index d700a8f..4afe807 100644 --- a/man/sub-.local_ortho_diss.Rd +++ b/man/sub-.local_ortho_diss.Rd @@ -18,6 +18,6 @@ \item{...}{not used} } \description{ -prints the subsets of localortho_diss objects +prints the subsets of local_ortho_diss objects } \keyword{internal} diff --git a/man/which_min_vector.Rd b/man/which_min_vector.Rd new file mode 100644 index 0000000..c58a417 --- /dev/null +++ b/man/which_min_vector.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{which_min_vector} +\alias{which_min_vector} +\title{A function to compute indices of minimum values of a distance vector} +\usage{ +which_min_vector(X) +} +\arguments{ +\item{X}{a vector of distances} +} +\value{ +a vector of the indices of the nearest neighbors +} +\description{ +For internal use only +} +\details{ +Used internally to find the nearest neighbors. +It searches in lower (or upper) triangular matrix. Therefore this must be the format of the +input data. The piece of code int \code{len = (sqrt(X.size()*8+1)+1)/2} generated an error in CRAN +since \code{sqrt} cannot be applied to integers. +} +\author{ +Antoine Stevens +} +\keyword{internal} diff --git a/src/fast_diss.cpp b/src/fast_diss.cpp index 6e20117..f9a3109 100644 --- a/src/fast_diss.cpp +++ b/src/fast_diss.cpp @@ -65,7 +65,7 @@ NumericVector fast_diss_vector(NumericVector X){ int n = ((nX*nX)-nX)/2; NumericVector output(n); #if defined(_OPENMP) - #pragma omp parallel for schedule(dynamic) +#pragma omp parallel for schedule(dynamic) #endif for(int i = 0; i < nX-1; i++) for(int j = i+1; j < nX; j++){ @@ -149,7 +149,7 @@ NumericVector minDissV(NumericVector X){ arma::uvec vindex(len); int i,j; #if defined(_OPENMP) - #pragma omp parallel for private(i,j) schedule(dynamic) +#pragma omp parallel for private(i,j) schedule(dynamic) #endif for(int i = 0; i < nX - 1; i++) for(int j = i + 1; j < nX; j++){ diff --git a/src/registerDynamicSymbol.c b/src/registerDynamicSymbol.c deleted file mode 100644 index ad20359..0000000 --- a/src/registerDynamicSymbol.c +++ /dev/null @@ -1,10 +0,0 @@ -// RegisteringDynamic Symbols - -#include -#include -#include - -void R_init_markovchain(DllInfo* info) { - R_registerRoutines(info, NULL, NULL, NULL, NULL); - R_useDynamicSymbols(info, TRUE); -} diff --git a/src/regression_methods.cpp b/src/regression_methods.cpp index b3ea8f5..bc13cce 100644 --- a/src/regression_methods.cpp +++ b/src/regression_methods.cpp @@ -16,7 +16,6 @@ using namespace Rcpp; //' @usage get_col_largest_sd(X) //' @param X a matrix. //' @return a value indicating the index of the column with the largest standard deviation. -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -48,7 +47,6 @@ NumericVector get_col_largest_sd(arma::mat X){ //' @usage get_column_sds(X) //' @param X a a matrix. //' @return a vector of standard deviation values. -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -63,7 +61,6 @@ NumericVector get_column_sds(arma::mat X){ //' @usage get_column_means(X) //' @param X a a matrix. //' @return a vector of mean values. -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -78,7 +75,6 @@ NumericVector get_column_means(arma::mat X){ //' @usage get_column_sums(X) //' @param X a matrix. //' @return a vector of standard deviation values. -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -97,8 +93,8 @@ NumericVector get_column_sums(arma::mat X){ //' @usage //' opls_for_projection(X, Y, ncomp, scale, //' maxiter, tol, -//' pcSelmethod = "cumvar", -//' pcSelvalue = 0.99) +//' pcSelmethod = "var", +//' pcSelvalue = 0.01) //' @param X a matrix of predictor variables. //' @param Y a matrix of either a single or multiple response variables. //' @param ncomp the number of pls components. @@ -139,7 +135,6 @@ NumericVector get_column_sums(arma::mat X){ //' and \code{Xscale}}. //' \item{\code{weights}}{ the matrix of wheights.} //' } -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -150,8 +145,8 @@ List opls_for_projection(arma::mat X, bool scale, double maxiter, double tol, - String pcSelmethod = "cumvar", - double pcSelvalue = 0.99 + String pcSelmethod = "var", + double pcSelvalue = 0.01 ){ int ny = Y.n_cols; @@ -252,8 +247,9 @@ List opls_for_projection(arma::mat X, Xloadings.row(i) = trans(p); Yloadings.row(i) = trans(q); explained_var(0,i) = arma::var(scores.col(i)); - explained_var(1,i) = sum(explained_var.row(0)) / xvar; - explained_var(2,i) = explained_var(0,i)/xvar; + explained_var(1,i) = explained_var(0,i) / xvar; + explained_var(2,i) = sum(explained_var.row(0)) / xvar; + ith_comp = ith_comp + 1; @@ -261,9 +257,9 @@ List opls_for_projection(arma::mat X, if (pcSelmethod == "var" || pcSelmethod == "cumvar") { bool chk; if (pcSelmethod == "cumvar") { - chk = explained_var(1,i) > pcSelvalue; + chk = explained_var(2,i) > pcSelvalue; } else { - chk = explained_var(2,i) < pcSelvalue; + chk = explained_var(1,i) < pcSelvalue; } if (chk) { ncomp = ith_comp - 1; @@ -271,19 +267,26 @@ List opls_for_projection(arma::mat X, if(i == 0) { throw exception("With the current value in the 'pc_selection' argument, no components are selected. Try another value."); } + if (pcSelmethod == "cumvar") { + ncomp = ncomp + 1; + } break; } } } } + + + arma::uvec pc_indices; if (pcSelmethod != "manual") { if (pcSelmethod == "var" || pcSelmethod == "cumvar") { if (pcSelmethod == "var") { - pc_indices = find(explained_var.row(2) >= pcSelvalue); + pc_indices = find(explained_var.row(1) >= pcSelvalue); } else { - pc_indices = find(explained_var.row(1) <= pcSelvalue && explained_var.row(1) > 0); + //pc_indices = find(explained_var.row(2) <= pcSelvalue && explained_var.row(2) > 0); + pc_indices = find(explained_var.row(2) > 0); } weights = weights.rows(pc_indices); coefficients = coefficients.cols(pc_indices); @@ -385,7 +388,6 @@ List opls_for_projection(arma::mat X, //' These objects contain information on the explained variance for the \code{X} and \code{Y} matrices respectively.} //' \item{\code{transf}}{ a \code{list} conating two objects: \code{Xcenter} and \code{Xscale}}. //' \item{\code{weights}}{ the matrix of wheights.}} -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -410,15 +412,16 @@ List opls_get_all(arma::mat X, arma::mat yex = arma::zeros(ny, ncomp); arma::mat Xscale; arma::mat x_scale_vec; + arma::mat x_center_vec; arma::mat Xz = X; - if(scale){ + if (scale) { Xscale = arma::repmat(Rcpp::as(get_column_sds(Xz)), Xz.n_rows, 1); Xz = Xz / Xscale; x_scale_vec = Xscale.row(0); } - x_scale_vec = Rcpp::as(get_column_means(Xz)); - Xz = Xz - arma::repmat(x_scale_vec, Xz.n_rows, 1); + x_center_vec = Rcpp::as(get_column_means(Xz)); + Xz = Xz - arma::repmat(x_center_vec, Xz.n_rows, 1); arma::mat Xpls = Xz; arma::mat Ypls = Y; @@ -490,8 +493,8 @@ List opls_get_all(arma::mat X, Xloadings.row(i) = trans(p); Yloadings.row(i) = trans(q); explained_var(0,i) = arma::var(scores.col(i)); - explained_var(1,i) = sum(explained_var.row(0)) / xvar; - explained_var(2,i) = explained_var(0,i)/xvar; + explained_var(1,i) = explained_var(0,i)/xvar; + explained_var(2,i) = sum(explained_var.row(0)) / xvar; } // convert this to standard deviation explained_var.row(0) = sqrt(explained_var.row(0)); @@ -542,7 +545,7 @@ List opls_get_all(arma::mat X, expvar = arma::var(expvar, 0, 0); sratio.row(j) = expvar/resvar ; // compute the intercept - y_hat_mean = x_scale_vec * coefficients.col(idx); + y_hat_mean = x_center_vec * coefficients.col(idx); y_hat_mean_vec = arma::vectorise(y_hat_mean); bo(k, j) = ymean_vec(k) - y_hat_mean_vec(0); idx = idx + 1; @@ -584,7 +587,7 @@ List opls_get_all(arma::mat X, Rcpp::Named("y_var") = yex ), Rcpp::Named("transf") = Rcpp::List::create( - Rcpp::Named("Xcenter") = x_scale_vec, + Rcpp::Named("Xcenter") = x_center_vec, Rcpp::Named("Xscale") = x_scale_vec ), _["weights"] = weights @@ -620,7 +623,6 @@ List opls_get_all(arma::mat X, //' \item{\code{Y}}{ the \code{Y} input.} //' \item{\code{transf}}{ a \code{list} conating two objects: \code{Xcenter} and \code{Xscale}}. //' \item{\code{weights}}{ the matrix of wheights.}} -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -643,6 +645,7 @@ List opls(arma::mat X, arma::mat bo = arma::zeros(ny, ncomp); arma::mat Xscale; arma::mat x_scale_vec; + arma::mat x_center_vec; arma::mat Xz = X; if (scale) { @@ -650,8 +653,8 @@ List opls(arma::mat X, Xz = Xz / Xscale; x_scale_vec = Xscale.row(0); } - x_scale_vec = Rcpp::as(get_column_means(Xz)); - Xz = Xz - arma::repmat(x_scale_vec, Xz.n_rows, 1); + x_center_vec = Rcpp::as(get_column_means(Xz)); + Xz = Xz - arma::repmat(x_center_vec, Xz.n_rows, 1); // matrices to declare arma::mat Xpls = Xz; @@ -731,7 +734,7 @@ List opls(arma::mat X, arma::mat jth_loading = Yloadings.col(k); for (int j = 0; j < ncomp; j++) { coefficients.col(idx) = projection_matrix.cols(0, j) * jth_loading.rows(0, j); - y_hat_mean = x_scale_vec * coefficients.col(idx); + y_hat_mean = x_center_vec * coefficients.col(idx); y_hat_mean_vec = arma::vectorise(y_hat_mean); bo(k, j) = ymean_vec(k) - y_hat_mean_vec(0); idx = idx + 1; @@ -748,7 +751,7 @@ List opls(arma::mat X, Rcpp::Named("projection_mat") = projection_matrix, Rcpp::Named("Y") = Y, Rcpp::Named("transf") = Rcpp::List::create( - Rcpp::Named("Xcenter") = x_scale_vec, + Rcpp::Named("Xcenter") = x_center_vec, Rcpp::Named("Xscale") = x_scale_vec ), _["weights"] = weights @@ -778,7 +781,6 @@ List opls(arma::mat X, //' \item{\code{projection_mat}}{ the projection matrix.} //' \item{\code{transf}}{ a \code{list} conating two objects: \code{Xcenter} and \code{Xscale}}. //' } -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -926,7 +928,6 @@ List opls_get_basics(arma::mat X, //' @param scale a logical indicating whether the matrix of predictors used to create the regression model was scaled. //' @param Xscale if \code{scale = TRUE} a matrix of one row with the values that must be used for scaling \code{newdata}. //' @return a matrix of predicted values. -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -938,14 +939,15 @@ Rcpp::NumericMatrix predict_opls(arma::mat bo, bool scale, arma::mat Xscale ){ - arma::mat Xz; - + if (scale) { - Xz = newdata / arma::repmat(Xscale, newdata.n_rows, 1); - } else { - Xz = newdata; - } - arma::mat predicted = (Xz * b.cols(0, ncomp - 1)) + arma::repmat(bo.cols(0, ncomp - 1), Xz.n_rows, 1); + newdata = newdata / arma::repmat(Xscale, newdata.n_rows, 1); + } + + // Not Necessary to center since b0 is used + // Xz = Xz - arma::repmat(Xcenter, newdata.n_rows, 1); + + arma::mat predicted = (newdata * b.cols(0, ncomp - 1)) + arma::repmat(bo.cols(0, ncomp - 1), newdata.n_rows, 1); return Rcpp::wrap(predicted); } @@ -962,17 +964,16 @@ Rcpp::NumericMatrix predict_opls(arma::mat bo, //' @param Xscale if \code{scale = TRUE} a matrix of one row with the values that must be used for scaling \code{newdata}. //' @param Xcenter a matrix of one row with the values that must be used for centering \code{newdata}. //' @return a matrix corresponding to the new spectra projected onto the PLS space -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble // [[Rcpp::export]] Rcpp::NumericMatrix project_opls(arma::mat projection_mat, - int ncomp, - arma::mat newdata, - bool scale, - arma::mat Xcenter, - arma::mat Xscale + int ncomp, + arma::mat newdata, + bool scale, + arma::mat Xcenter, + arma::mat Xscale ){ if(scale){ @@ -994,7 +995,6 @@ Rcpp::NumericMatrix project_opls(arma::mat projection_mat, //' @param projection_mat the projection matrix generated by the \code{opls_get_basics} function. //' @param xloadings the loadings matrix generated by the \code{opls_get_basics} function. //' @return a matrix of 1 row and 1 column. -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -1033,7 +1033,6 @@ Rcpp::NumericMatrix reconstruction_error(arma::mat x, //' @param Xcenter a matrix of one row with the values that must be used for centering \code{newdata}. //' @param Xscale if \code{scale = TRUE} a matrix of one row with the values that must be used for scaling \code{newdata}. //' @return a matrix of one row with the weights for each component between the max. and min. specified. -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -1065,7 +1064,7 @@ Rcpp::NumericMatrix get_pls_weights(arma::mat projection_mat, arma::mat xrec = sc.cols(0,i) * xloadings.rows(0,i); xrmsres.col(i) = sqrt(arma::mean(arma::mean(pow(Xz - xrec, 2), 0), 1)); } - + arma::mat rmsb = sqrt(get_column_means(pow(coefficients.cols(0, max_component - 1), 2))); arma::mat rmsb_x = trans(rmsb.rows(min_component - 1, max_component - 1)) % xrmsres.cols(min_component - 1, max_component - 1); arma::mat whgtn = pow(rmsb_x, -1); @@ -1103,23 +1102,22 @@ Rcpp::NumericMatrix get_pls_weights(arma::mat projection_mat, //' \item{\code{st_rmse_seg}}{ the standardized RMSEs.} //' \item{\code{rsq_seg}}{ the coefficients of determination.} //' } -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble // [[Rcpp::export]] List opls_cv_cpp(arma::mat X, - arma::mat Y, - bool scale, - String method, - arma::mat mindices, - arma::mat pindices, - int min_component, - int ncomp, - arma::mat new_x, - double maxiter, - double tol, - arma::mat wapls_grid + arma::mat Y, + bool scale, + String method, + arma::mat mindices, + arma::mat pindices, + int min_component, + int ncomp, + arma::mat new_x, + double maxiter, + double tol, + arma::mat wapls_grid ){ arma::mat rmseseg; arma::mat strmseseg; @@ -1170,10 +1168,10 @@ List opls_cv_cpp(arma::mat X, ypred = Rcpp::as(predict_opls(fit["bo"], fit["coefficients"], - ncomp, - pxmatslice, - scale, - transf["Xscale"])); + ncomp, + pxmatslice, + scale, + transf["Xscale"])); arma::mat rdl = sqrt(get_column_means(pow(rpymatslice - ypred, 2))); rmseseg.col(i) = rdl; @@ -1196,14 +1194,14 @@ List opls_cv_cpp(arma::mat X, compweights = arma::zeros(1, ncomp); compweights.cols(min_component-1, ncomp-1) = Rcpp::as(get_pls_weights(cfit["projection_mat"], - cfit["X_loadings"], - cfit["coefficients"], - new_x, - min_component, - ncomp, - scale, - ctransf["Xcenter"], - ctransf["Xscale"])); + cfit["X_loadings"], + cfit["coefficients"], + new_x, + min_component, + ncomp, + scale, + ctransf["Xcenter"], + ctransf["Xscale"])); arma::mat rcompweights = arma::repmat(compweights, pindices.n_rows, 1); @@ -1243,10 +1241,10 @@ List opls_cv_cpp(arma::mat X, nypred = Rcpp::as(predict_opls(fit["bo"], fit["coefficients"], - ncomp, - pxmatslice, - scale, - transf["Xscale"])); + ncomp, + pxmatslice, + scale, + transf["Xscale"])); ypred = arma::sum(rcompweights % nypred, 1); arma::mat rdl = sqrt(get_column_means(pow(pymatslice - ypred, 2))); @@ -1269,14 +1267,14 @@ List opls_cv_cpp(arma::mat X, compweights = arma::zeros(1, ncomp); compweights.cols(min_component-1, ncomp-1) = Rcpp::as(get_pls_weights(cfit["projection_mat"], - cfit["X_loadings"], - cfit["coefficients"], - new_x, - min_component, - ncomp, - scale, - ctransf["Xcenter"], - ctransf["Xscale"])); + cfit["X_loadings"], + cfit["coefficients"], + new_x, + min_component, + ncomp, + scale, + ctransf["Xcenter"], + ctransf["Xscale"])); crcompweights = arma::zeros(wapls_grid.n_rows, ncomp); for(int i = 0; (unsigned)i < crcompweights.n_rows; i++){ @@ -1329,10 +1327,10 @@ List opls_cv_cpp(arma::mat X, nypred = (Rcpp::as(predict_opls(fit["bo"], fit["coefficients"], - ncomp, - pxmatslice, - scale, - transf["Xscale"]))); + ncomp, + pxmatslice, + scale, + transf["Xscale"]))); ypred = nypred * trans(crcompweights); @@ -1379,15 +1377,14 @@ List opls_cv_cpp(arma::mat X, //' \item{\code{Ycenter}}{ if matrix of predictors was scaled, the centering vector used for \code{Y}.} //' \item{\code{Yscale}}{ if matrix of predictors was scaled, the scaling vector used for \code{Y}.} //' } -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble // [[Rcpp::export]] List gaussian_process(arma::mat X, - arma::mat Y, - float noisev = 0.001, - bool scale = true + arma::mat Y, + float noisev = 0.001, + bool scale = true ){ // matrices to declare @@ -1454,19 +1451,18 @@ List gaussian_process(arma::mat X, //' @param Ycenter if \code{center = TRUE} a matrix of one row with the values that must be used for accounting for the centering of the response variable. //' @param Yscale if \code{scale = TRUE} a matrix of one row with the values that must be used for accounting for the scaling of the response variable. //' @return a matrix of predicted values -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble // [[Rcpp::export]] NumericVector predict_gaussian_process(arma::mat Xz, - arma::mat alpha, - arma::mat newdata, - bool scale, - arma::mat Xcenter, - arma::mat Xscale, - arma::mat Ycenter, - arma::mat Yscale + arma::mat alpha, + arma::mat newdata, + bool scale, + arma::mat Xcenter, + arma::mat Xscale, + arma::mat Ycenter, + arma::mat Yscale ){ arma::mat newdatatr = newdata; @@ -1506,17 +1502,16 @@ NumericVector predict_gaussian_process(arma::mat Xz, //' \item{\code{st.rmse.seg}}{ the standardized RMSEs.} //' \item{\code{rsq.seg}}{ the coefficients of determination.} //' } -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble // [[Rcpp::export]] List gaussian_process_cv(arma::mat X, - arma::mat Y, - arma::mat mindices, - arma::mat pindices, - float noisev = 0.001, - bool scale = true + arma::mat Y, + arma::mat mindices, + arma::mat pindices, + float noisev = 0.001, + bool scale = true ){ arma::mat rmseseg = arma::zeros(1, mindices.n_cols); @@ -1578,8 +1573,8 @@ List gaussian_process_cv(arma::mat X, //' @usage //' pca_nipals(X, ncomp, center, scale, //' maxiter, tol, -//' pcSelmethod = "cumvar", -//' pcSelvalue = 0.99) +//' pcSelmethod = "var", +//' pcSelvalue = 0.01) //' @param X a matrix of predictor variables. //' @param Y a matrix of either a single or multiple response variables. //' @param ncomp the number of pls components. @@ -1589,13 +1584,13 @@ List gaussian_process_cv(arma::mat X, //' @param pcSelmethod the method for selecting the number of components. //' Options are: \code{'cumvar'} (for selecting the number of principal components based on a given //' cumulative amount of explained variance) and \code{"var"} (for selecting the number of principal -//' components based on a given amount of explained variance). Default is \code{'cumvar'} +//' components based on a given amount of explained variance). Default is \code{'var'} //' @param pcSelvalue a numerical value that complements the selected method (\code{pcSelmethod}). //' If \code{"cumvar"} is chosen, it must be a value (larger than 0 and below 1) indicating the maximum //' amount of cumulative variance that the retained components should explain. If \code{"var"} is chosen, //' it must be a value (larger than 0 and below 1) indicating that components that explain (individually) //' a variance lower than this threshold must be excluded. If \code{"manual"} is chosen, it must be a value -//' specifying the desired number of principal components to retain. Default is 0.99. +//' specifying the desired number of principal components to retain. Default is 0.01. //' @return a list containing the following elements: //' \itemize{ //' \item{\code{pc_scores}}{ a matrix of principal component scores.} @@ -1603,7 +1598,6 @@ List gaussian_process_cv(arma::mat X, //' \item{\code{variance}}{ a matrix of the variance of the principal components.} //' \item{\code{scale}}{ a \code{list} conating two objects: \code{center} and \code{scale}, which correspond to the vectors used to center and scale the input matrix.} //' } -//' @useDynLib resemble //' @author Leonardo Ramirez-Lopez //' @keywords internal //' @useDynLib resemble @@ -1614,8 +1608,8 @@ List pca_nipals(arma::mat X, bool scale, double maxiter, double tol, - String pcSelmethod = "cumvar", - double pcSelvalue = 0.99 + String pcSelmethod = "var", + double pcSelvalue = 0.01 ){ arma::mat Xscale; @@ -1673,18 +1667,18 @@ List pca_nipals(arma::mat X, pc_scores.col(i) = tt.col(0); pc_loadings.col(i) = pp_std.col(0); explained_var(0,i) = arma::var(pc_scores.col(i)); - explained_var(1,i) = sum(explained_var.row(0)) / xvar; - explained_var(2,i) = explained_var(0,i)/xvar; + explained_var(1,i) = explained_var(0,i) / xvar; + explained_var(2,i) = sum(explained_var.row(0)) / xvar; ith_comp = ith_comp + 1; if(pcSelmethod == "var" || pcSelmethod == "cumvar") { bool chk; if(pcSelmethod == "cumvar"){ - chk = explained_var(1,i) > pcSelvalue; + chk = explained_var(2,i) > pcSelvalue; } else{ - chk = explained_var(2,i) < pcSelvalue; + chk = explained_var(1,i) < pcSelvalue; } if(chk) { @@ -1702,7 +1696,7 @@ List pca_nipals(arma::mat X, arma::uvec pc_indices; if(pcSelmethod == "var") { - pc_indices = find(explained_var.row(2) >= pcSelvalue); + pc_indices = find(explained_var.row(1) >= pcSelvalue); pc_scores = pc_scores.cols(pc_indices); pc_loadings = pc_loadings.cols(pc_indices); explained_var = explained_var.cols(pc_indices); @@ -1710,12 +1704,14 @@ List pca_nipals(arma::mat X, if(pcSelmethod == "cumvar") { - pc_indices = find(explained_var.row(1) <= pcSelvalue && explained_var.row(1) > 0); + //pc_indices = find(explained_var.row(2) <= pcSelvalue && explained_var.row(2) > 0); + pc_indices = find(explained_var.row(2) > 0); pc_scores = pc_scores.cols(pc_indices); pc_loadings = pc_loadings.cols(pc_indices); explained_var = explained_var.cols(pc_indices); } - + // convert this to standard deviation + explained_var.row(0) = sqrt(explained_var.row(0)); return Rcpp::List::create( Rcpp::Named("pc_indices") = pc_indices, diff --git a/src/which_min.cpp b/src/which_min.cpp index 8810c97..e0a8331 100644 --- a/src/which_min.cpp +++ b/src/which_min.cpp @@ -34,48 +34,3 @@ NumericVector which_min(NumericMatrix X){ } return wrap(vindex +1); } - -//' @title A function to compute indices of minimum values of a distance vector -//' @description For internal use only -//' @usage -//' which_minV(X,cores) -//' @param X a vector of distance (as computed in \code{resemble:::fastDistVV} or \code{base::dist}) -//' @return a vector of the indices of the nearest neighbors -//' @details -//' Used internally to find the nearest neighbors. -//' It searches in lower (or upper?) trianguular matrix. Therefore this must be the format of the -//' input data. The piece of code int \code{len = (sqrt(X.size()*8+1)+1)/2} generated an error in CRAN -//' since \code{sqrt} cannot be applied to integers. -//' @keywords internal -//' @useDynLib resemble -//' @author Antoine Stevens -// [[Rcpp::plugins(openmp)]] -// [[Rcpp::export]] -NumericVector which_min_vector(NumericVector X){ - arma::uword index; - double vct = (sqrt(((double)X.size()) * 8.0 + 1.0) + 1.0) / 2.0; - int len = (int)vct; - // int len = (sqrt(X.size()*8+1)+1)/2; - arma::uvec vindex(len); - int i,j; -#if defined(_OPENMP) - #pragma omp parallel for private(i,j) schedule(dynamic) -#endif - for(i = 0; i < len; i++){ - arma::vec x(len); - for(j = 0; j < i; j++){ - // triangular sequence - int k = j * len - (j * (j + 3) / 2) + i - 1; - x[j] = X(k); - } - for(j = i+1; j < len; j++){ - // triangular sequence - int k2 = i * len - (i * (i + 3) / 2) + j - 1; - x[j] = X(k2); - } - x[i] = arma::datum::nan; // remove diag - x.min(index); // don't assign result to a value since we are interested only in the index - vindex[i] = index; - } - return wrap(vindex +1); -} diff --git a/src/which_min_vector.cpp b/src/which_min_vector.cpp new file mode 100644 index 0000000..a5a3263 --- /dev/null +++ b/src/which_min_vector.cpp @@ -0,0 +1,52 @@ +#include +#ifdef _OPENMP +#include // OpenMP +#endif + +using namespace Rcpp; +// [[Rcpp::plugins(openmp)]] +// [[Rcpp::depends(RcppArmadillo)]] + +//' @title A function to compute indices of minimum values of a distance vector +//' @description For internal use only +//' @usage +//' which_min_vector(X) +//' @param X a vector of distances +//' @return a vector of the indices of the nearest neighbors +//' @details +//' Used internally to find the nearest neighbors. +//' It searches in lower (or upper) triangular matrix. Therefore this must be the format of the +//' input data. The piece of code int \code{len = (sqrt(X.size()*8+1)+1)/2} generated an error in CRAN +//' since \code{sqrt} cannot be applied to integers. +//' @keywords internal +//' @useDynLib resemble +//' @author Antoine Stevens +// [[Rcpp::export]] +NumericVector which_min_vector(NumericVector X){ + arma::uword index; + double vct = (sqrt(((double)X.size()) * 8.0 + 1.0) + 1.0) / 2.0; + int len = (int)vct; + // int len = (sqrt(X.size()*8+1)+1)/2; + arma::uvec vindex(len); + int i,j; +#if defined(_OPENMP) + #pragma omp parallel for private(i,j) schedule(dynamic) +#endif + for(i = 0; i < len; i++){ + arma::vec x(len); + for(j = 0; j < i; j++){ + // triangular sequence + int k = j * len - (j * (j + 3) / 2) + i - 1; + x[j] = X(k); + } + for(j = i+1; j < len; j++){ + // triangular sequence + int k2 = i * len - (i * (i + 3) / 2) + j - 1; + x[j] = X(k2); + } + x[i] = arma::datum::nan; // remove diag + x.min(index); // don't assign result to a value since we are interested only in the index + vindex[i] = index; + } + return wrap(vindex +1); +} diff --git a/tests/testthat/test-pc_projection.R b/tests/testthat/test-pc_projection.R index e5d6dc2..517f138 100644 --- a/tests/testthat/test-pc_projection.R +++ b/tests/testthat/test-pc_projection.R @@ -4,21 +4,21 @@ test_that("pc_projection works", { # tolernce for results supposed to be 0s tol <- 1e-5 nirdata <- data("NIRsoil", package = "prospectr") - + Xu <- NIRsoil$spc[!as.logical(NIRsoil$train), ] Yu <- NIRsoil$CEC[!as.logical(NIRsoil$train)] - + Yr <- NIRsoil$CEC[as.logical(NIRsoil$train)] Yr_2 <- NIRsoil$Ciso[as.logical(NIRsoil$train)] Xr <- NIRsoil$spc[as.logical(NIRsoil$train), ] - + Xu <- Xu[!is.na(Yu), ] Xr <- Xr[!is.na(Yr), ] - + Yu <- Yu[!is.na(Yu)] Yr_2 <- Yr[!is.na(Yr)] Yr <- Yr[!is.na(Yr)] - + cumvar_value <- 0.999 one_input_matrix <- pc_projection(Xr, @@ -27,7 +27,8 @@ test_that("pc_projection works", { method = "pca") expect_true(ncol(one_input_matrix$scores) == one_input_matrix$n_components) - expect_true(all(one_input_matrix$variance[2,] < cumvar_value)) + test_ncomp <- one_input_matrix$n_components - 1 + expect_true(all(one_input_matrix$variance[3, 1:test_ncomp] < cumvar_value)) two_input_matrices <- pc_projection(Xr, Xu, pc_selection = list(method = "cumvar", value = cumvar_value), @@ -35,26 +36,27 @@ test_that("pc_projection works", { method = "pca") two_input_matrices_var <- pc_projection(Xr, Xu, - pc_selection = list(method = "var", value = 1 - cumvar_value), - center = TRUE, scale = FALSE, - method = "pca") + pc_selection = list(method = "var", value = 1 - cumvar_value), + center = TRUE, scale = FALSE, + method = "pca") expect_true(ncol(two_input_matrices$scores) == two_input_matrices$n_components) - expect_true(all(two_input_matrices$variance[2,] < cumvar_value)) + two_test_ncomp <- two_input_matrices$n_components - 1 + expect_true(all(two_input_matrices$variance[3, 1:two_test_ncomp] < cumvar_value)) preds <- sum(abs(predict(two_input_matrices)[1:nrow(Xr), ] - predict(two_input_matrices, Xr))) expect_true(preds < tol) - + opc_method <- pc_projection(Xr, Xu, Yr = Yr, pc_selection = list(method = "opc", value = 30), center = TRUE, scale = FALSE, method = "pca") opc_method_nipals <- pc_projection(Xr, Xu, Yr = Yr, - pc_selection = list(method = "opc", value = 30), - center = TRUE, scale = FALSE, - method = "pca.nipals") + pc_selection = list(method = "opc", value = 30), + center = TRUE, scale = FALSE, + method = "pca.nipals") expect_true(opc_method$n_components == which.min(opc_method$opc_evaluation[,2])) expect_true(opc_method$n_components == 20) @@ -68,26 +70,27 @@ test_that("pc_projection works", { y = opc_method$scores) expect_true(sum(1 - cor_equiv) < tol) - + # check that the number of components for method = "cumvar" is properly # obtained, this can be done with the results of opc_method as it selects more # components than in the "cumvar" test - expect_true(sum(opc_method$variance[2,] < cumvar_value) == two_input_matrices$n_components) + expect_true(sum(opc_method$variance[3, ] < cumvar_value) == two_input_matrices$n_components - 1) # do the same for method = "var" - expect_true(sum(opc_method$variance[3,] > (1 - cumvar_value)) == two_input_matrices_var$n_components) + expect_true(sum(opc_method$variance[2,] > (1 - cumvar_value)) == two_input_matrices_var$n_components) expect_true(ncol(two_input_matrices$scores) == two_input_matrices$n_components) - expect_true(all(two_input_matrices$variance[2,] < cumvar_value)) + test_ncomp <- two_input_matrices$n_components - 1 + expect_true(all(two_input_matrices$variance[3, 1:test_ncomp] < cumvar_value)) bb <- cbind(name_test_yr = Yr, Yr_2) - + opc_method_nipals <- pc_projection(Xr, Xu, Yr = bb, pc_selection = list(method = "opc", value = 30), center = TRUE, scale = FALSE, method = "pca.nipals") expect_true("rmsd_name_test_yr" %in% colnames(opc_method_nipals$opc_evaluation)) - + }) diff --git a/tests/testthat/test-pls_projection.R b/tests/testthat/test-pls_projection.R index fca0f52..ac2f6a5 100644 --- a/tests/testthat/test-pls_projection.R +++ b/tests/testthat/test-pls_projection.R @@ -25,8 +25,9 @@ test_that("pls_projection works", { pc_selection = list(method = "cumvar", value = cumvar_value), center = TRUE, scale = FALSE) + test_ncomp <- one_input_matrix$n_components - 1 expect_true(ncol(one_input_matrix$scores) == one_input_matrix$n_components) - expect_true(all(one_input_matrix$variance$x_var[2,] < cumvar_value)) + expect_true(all(one_input_matrix$variance$x_var[3, 1:test_ncomp] < cumvar_value)) two_input_matrices <- pls_projection(Xr, Xu, Yr, pc_selection = list(method = "cumvar", value = cumvar_value), @@ -36,9 +37,9 @@ test_that("pls_projection works", { pc_selection = list(method = "var", value = 1 - cumvar_value), center = TRUE, scale = FALSE) - + two_test_ncomp <- two_input_matrices$n_components - 1 expect_true(ncol(two_input_matrices$scores) == two_input_matrices$n_components) - expect_true(all(two_input_matrices$variance$x_var[2,] < cumvar_value)) + expect_true(all(two_input_matrices$variance$x_var[3, 1:two_test_ncomp] < cumvar_value)) preds <- sum(abs(predict(two_input_matrices)[1:nrow(Xr), ] - predict(two_input_matrices, Xr))) expect_true(preds < tol) @@ -53,13 +54,14 @@ test_that("pls_projection works", { # check that the number of components for method = "cumvar" is properly # obtained, this can be done with the results of opc_method as it selects more # components than in the "cumvar" test - expect_true(sum(opc_method$variance$x_var[2,] < cumvar_value) == two_input_matrices$n_components) + expect_true(sum(opc_method$variance$x_var[3,] < cumvar_value) == two_input_matrices$n_components - 1) # do the same for method = "var" - expect_true(sum(opc_method$variance$x_var[3,] > (1 - cumvar_value)) == two_input_matrices_var$n_components) + expect_true(sum(opc_method$variance$x_var[2,] > (1 - cumvar_value)) == two_input_matrices_var$n_components) expect_true(ncol(two_input_matrices$scores) == two_input_matrices$n_components) - expect_true(all(two_input_matrices$variance$x_var[2,] < cumvar_value)) + two_test_ncnomp <- two_input_matrices$n_components - 1 + expect_true(all(two_input_matrices$variance$x_var[3, 1:two_test_ncnomp] < cumvar_value)) opc_method <- pls_projection(Xr, Xu, Yr = Yr, pc_selection = list(method = "opc", value = 20),