---
title: "Scrape Michigan Lakes"
author: "Jemma Stachelek"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Scrape Michigan Lakes}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r }
library(wikilake)
```

## Generate list of Michigan Lakes

### Get Wikipedia URL of Category

```{r category url, eval = FALSE}
# Query Wikipedia for the metadata of the category page; the canonical URL
# in the response is used below to download the page itself.
category_info <- WikipediR::page_info(
  "en", "wikipedia",
  page = "Category:Lakes of Michigan"
)
```

### Scrape lake names

```{r scrape names, eval = FALSE}
# Download the category page and walk down to the link titles of its
# member pages: category listing -> list items -> anchors -> `title`.
category_page <- xml2::read_html(category_info$query$pages[[1]]$canonicalurl)
listing <- rvest::html_nodes(category_page, "#mw-pages .mw-category")
items <- rvest::html_nodes(listing, "li")
links <- rvest::html_nodes(items, "a")
lake_names <- rvest::html_attr(links, "title")
```

### Remove junk names

```{r remove junk names, eval = FALSE}
# Drop every title that matches any of the junk patterns.
junk_patterns <- c("List", "Watershed", "lakes", "Mud Lake")
is_junk <- grepl(paste(junk_patterns, collapse = "|"), lake_names)
lake_names <- lake_names[!is_junk]
```

### Scrape tables

```{r scrape tables, eval = FALSE}
lakes <- lapply(lake_names, lake_wiki)

# remove missing coordinates
# vapply() guarantees exactly one TRUE/FALSE per scraped lake, so the mask
# always lines up with `lakes`; entries without a usable `Lat` are dropped.
has_coords <- vapply(
  lakes,
  function(x) {
    is.data.frame(x) && "Lat" %in% names(x) && !anyNA(x[["Lat"]])
  },
  logical(1)
)
lakes <- lakes[has_coords]
```

### Collapse list to `data.frame`

```{r collapse list to data.frame, eval = FALSE}
# The scraped tables do not all share the same columns. Build one frame
# holding the union of all column names and fill it one lake per row.
res_df_names <- unique(unlist(lapply(lakes, names)))
res_df <- data.frame(
  matrix(NA, nrow = length(lakes), ncol = length(res_df_names))
)
names(res_df) <- res_df_names

for (i in seq_along(lakes)) {
  dt <- lakes[[i]]

  # pad the columns this lake lacks with NA so it matches the full layout
  missing_cols <- setdiff(res_df_names, names(dt))
  if (length(missing_cols) > 0) {
    dt[missing_cols] <- NA
  }

  # reorder to the common column order before storing the row
  res_df[i, ] <- dt[, res_df_names, drop = FALSE]
}
```

```{r echo=FALSE, eval=FALSE}
# keep only the columns with more than 20 non-missing values
n_present <- colSums(!is.na(res_df))
good_cols <- names(res_df)[n_present > 20]
res_df <- res_df[, good_cols, drop = FALSE]
```

```{r echo = FALSE}
# use the dataset loaded by `data(milakes)` for the remaining chunks
# (the scraping chunks above are not evaluated)
data(milakes)
res_df <- milakes
```

### Map lakes

```{r map lakes, fig.height=6,fig.align="center"}
library(sp)

# promote the data frame to a spatial object using its Lon/Lat columns
coordinates(res_df) <- ~ Lon + Lat

# namespace-qualified so the call does not depend on `maps` being attached
maps::map("state", region = "michigan", mar = c(0, 0, 0, 0))
points(res_df, col = "red", pch = 19)
```

```{r lake depth distribution }
hist(log(res_df$`Max. depth`), main = "", xlab = "Max depth (log(m))")
```