Data Science Desktop Survival Guide
by Graham Williams

Data Template
# Data template: load a dataset, normalise its variable names, record the
# role each variable plays, and impute missing values.

library(janitor)      # Cleanup: clean_names().
library(rattle)       # Dataset: weatherAUS.
library(magrittr)     # Pipelines: %<>%.
library(randomForest) # Imputation: na.roughfix().
library(dplyr)        # Wrangling: rename_all().

# Load the dataset by name into the generic variable ds.
dsname <- "weatherAUS"
ds     <- get(dsname)

# Record the number of observations and report the dimensions.
nobs <- nrow(ds)
dim(ds)

# Remember the original variable names, normalise the names in the dataset,
# then label each original name with its normalised counterpart.
vnames <- names(ds)
ds %<>% rename_all(normVarNames)
names(vnames) <- names(ds)
names(ds)

# Start with every variable. Placing the target first, removing its
# duplicate, and reversing leaves the target as the last variable.
vars   <- names(ds)
target <- "rain_tomorrow"
vars   <- c(target, vars) %>% unique() %>% rev()

# Normalise the levels of every factor in the same way as the variable names.
# vapply() guarantees a logical vector whatever the shape of ds.
for (v in which(vapply(ds, is.factor, logical(1)))) {
  levels(ds[[v]]) %<>% normVarNames()
}

# Variables that are not to be used as inputs: the risk variable and the
# identifiers.
risk   <- "risk_mm"
id     <- c("date", "location")
ignore <- c(risk, id)

# Variables retained for modelling, the inputs among them, and the formula.
vars   <- setdiff(vars, ignore)
inputs <- setdiff(vars, target)
form   <- formula(paste(target, "~ ."))

# na.roughfix() stops with "na.roughfix only works for numeric or factor" if
# any column is of another type, so impute only the numeric and factor
# variables and leave every other column untouched.
fixable <- vars[vapply(ds[vars],
                       function(x) is.numeric(x) || is.factor(x),
                       logical(1))]
if (length(fixable) > 0) {
  ds[fixable] %<>% na.roughfix()
}
# Proportions used to partition the observations. The first sizes the tr
# sample and the second sizes the tu sample; te receives whatever remains.
SPLIT <- c(0.7, 0.15, 0.15)

# Row indices for each partition: sample tr from all rows, sample tu from the
# rows not in tr, and assign every row in neither to te.
tr <- sample(nobs, SPLIT[1] * nobs)
tu <- sample(setdiff(seq_len(nobs), tr), SPLIT[2] * nobs)
te <- setdiff(setdiff(seq_len(nobs), tr), tu)

# Values of the target variable within each partition.
target.tr <- pull(slice(ds, tr), target)
target.tu <- pull(slice(ds, tu), target)
target.te <- pull(slice(ds, te), target)

# Values of the risk variable within each partition, when one is defined.
if (!is.null(risk)) {
  risk.tr <- pull(slice(ds, tr), risk)
  risk.tu <- pull(slice(ds, tu), risk)
  risk.te <- pull(slice(ds, te), risk)
}
|