library(rvest)
library(tidyverse)
library(xml2)
library(leaflet)
doc <- read_html("http://en.wikipedia.org/wiki/List_of_modern_obelisks", trim = T)
tables <- doc |>
html_nodes(".wikitable") |>
as.list()Scraping a Wikipedia table with images in rvest is annoying, as seemingly indicated by the Stack Overflow answers to people with similar problems.
My goal is to represent “List of Modern Obelisks” as a Leaflet map showing the global distribution of the monuments. The core problem is that html_table appears to use xml_text to extract text information. What I want is a column giving the src= url, so I can include an image in the Leaflet popup for each obelisk.
There are multiple tables in this list, one for each century. I can start by getting a list of these tables by selecting .wikitable objects from our html response.
Instead of using html_table I’ll iterate through each row, extracting the text and image link separately, then building a new row.
clean.row <- function(broken.row){
# Get table values from row
broken.row <- broken.row |>
html_nodes("td")
# Get just the texxt (breaks links)
cleaned.row <- broken.row |> xml_text()
# Select image element in row, extract link
image.link <- broken.row |> html_nodes(".mw-file-element") |> html_attr("srcset")
# If there is an image link
if(!(is.null(image.link)|length(image.link) == 0)){
# Replace missing image value with the link
cleaned.row[2] <- image.link
} else {
# Otherwise replace with nothing
cleaned.row[2] <- ""
}
# Convert row to tibble
new_tib <- cleaned.row |> as_tibble_row(.name_repair = "unique")
# Set names
colnames(new_tib) <- c("name", "image_url", "location", "country", "elevation_m", "elevation_f", "years", "coords", "notes")
new_tib
}
get.table <- function(table){
# Get list of the table's rows
table.rows <- table |>
html_nodes("tr")
# For each row (skipping the header rows), clean the row and combine the tibble
bind_rows(map(table.rows[3:length(table.rows)], clean.row))
}
# Call get.table on each of the tables, combining all of the rows
obelisks.raw.scraped <- bind_rows(map(tables, get.table))This results in a combined tibble with everything we need. I have to do some cleaning before making my map.
fix.image.links <- function(l) {
l |>
str_remove(pattern = " 1.5x| 2x") |>
str_squish() |>
str_replace(pattern = "[//]{2}", replacement = "https://")
}
fix.coords <- function(c) str_split_1(c, " / ") |> tail(n = 1) |> str_replace(pattern = ";", replacement = ",")
fix.coords.v <- Vectorize(fix.coords)
obelisks.cleaned <- obelisks.raw.scraped |>
# Separate default url values (two separate links to different sized versions)
separate_wider_delim(cols = image_url, delim = ",", names = c("url1", "url2"), too_few = "align_start") |>
mutate(across(starts_with("url"), ~ fix.image.links(.))) |>
# Coords originally come with three different formats
mutate(coords = fix.coords.v(coords)) |>
separate_wider_delim(cols = coords, delim = ",", names = c("lat", "long"), too_few = "align_start") |>
mutate(across(c(lat, long), ~ as.numeric(.)))To plot the result, we need some custom HTML for the popup. I asked ChatGPT to generate the structure on a whim and was suprised by how good the result was, though it did need some tweaking. This is the prompt I used:
Create styled html for a popup pane for leaflet with two columns side by side. The left column fits a portrait oriented photo with about 30% of the entire width. The right side has a title and various fields with values underneath it
get.html <-
function(monument_name,
img_url,
year_constructed,
monument_location,
monument_country,
altitude_feet,
altitude_meters) {
paste0(
"
<!DOCTYPE html>
<html>
<head>
<style>
/* Popup Container Styles */
.popup-container {
width: 300px; /* Adjust the width as needed */
padding: 10px;
display: flex;
}
.left-column {
flex: 1;
max-width: 40%;
max-height: 225px;
position: relative;
overflow: hidden;
}
/* Right Column Styles */
.right-column {
flex: 2;
padding-left: 10px;
align-self: center;
}
/* Image Styles */
.popup-image {
max-width: 100%;
height: auto;
max-height: 200px;
}
/* Title Style */
.popup-title {
font-size: 14px;
font-weight: bold;
margin-bottom: 5px;
}
/* Field Styles */
.popup-field {
font-size: 14px;
margin-bottom: 3px;
}
</style>
</head>
<body>
<div class='popup-container'>
<div class='left-column'>
<img class= 'popup-image', src='",
img_url,
"'>
</div>
<div class='right-column'>
<div class='popup-title'>",
monument_name,
"</div>
<div class='popup-field'><strong>Country:</strong> ",
monument_country,
"</div>
<div class='popup-field'><strong>Location:</strong> ",
monument_location,
"</div>
<div class='popup-field'><strong>Constructed:</strong> ",
year_constructed,
"</div>
<div class='popup-field'><strong>Altitude:</strong> ",
altitude_meters,
" m. </div>
</div>
</div>
</body>
</html>"
)
}The rest is pretty straightforward. I pass values from the dataframe to get.html from above, which generates a unique popup for each obelisk.
obelisks.plot <- obelisks.cleaned |>
mutate(
popup = get.html(
monument_name = name,
img_url = url1,
year_constructed = years,
monument_location = location,
monument_country = country,
altitude_feet = elevation_f,
altitude_meters = elevation_m
)
)
leaflet(obelisks.cleaned) |>
addTiles() |>
addMarkers(
lat = as.numeric(obelisks.plot$lat),
lng = as.numeric(obelisks.plot$long),
popup = obelisks.plot$popup
) Here’s everything all together:
library(rvest)
library(tidyverse)
library(xml2)
library(leaflet)
doc <-
read_html("http://en.wikipedia.org/wiki/List_of_modern_obelisks",
trim = T)
tables <- doc |>
html_nodes(".wikitable") |>
as.list()
clean.row <- function(broken.row) {
# Get table values from row
broken.row <- broken.row |>
html_nodes("td")
# Get just the texxt (breaks links)
cleaned.row <- broken.row |> xml_text()
# Select image element in row, extract link
image.link <-
broken.row |> html_nodes(".mw-file-element") |> html_attr("srcset")
# If there is an image link
if (!(is.null(image.link) | length(image.link) == 0)) {
# Replace missing image value with the link
cleaned.row[2] <- image.link
} else {
# Otherwise replace with nothing
cleaned.row[2] <- ""
}
# Convert row to tibble
new_tib <-
cleaned.row |> as_tibble_row(.name_repair = "unique")
# Set names
colnames(new_tib) <-
c(
"name",
"image_url",
"location",
"country",
"elevation_m",
"elevation_f",
"years",
"coords",
"notes"
)
new_tib
}
get.table <- function(table) {
table.rows <- table |>
html_nodes("tr")
bind_rows(map(table.rows[3:length(table.rows)], clean.row))
}
obelisks.raw.scraped <- bind_rows(map(tables, get.table))
### Clean obelisk data
fix.image.links <- function(l) {
l |>
str_remove(pattern = " 1.5x| 2x") |>
str_squish() |>
str_replace(pattern = "[//]{2}", replacement = "https://")
}
fix.coords <- function(c) str_split_1(c, " / ") |> tail(n = 1) |> str_replace(pattern = ";", replacement = ",")
fix.coords.v <- Vectorize(fix.coords)
obelisks.cleaned <- obelisks.raw.scraped |>
separate_wider_delim(cols = image_url, delim = ",", names = c("url1", "url2"), too_few = "align_start") |>
mutate(across(starts_with("url"), ~ fix.image.links(.))) |>
mutate(coords = fix.coords.v(coords)) |>
separate_wider_delim(cols = coords, delim = ",", names = c("lat", "long"), too_few = "align_start") |>
mutate(across(c(lat, long), ~ as.numeric(.)))
### Plotting
get.html <-
function(monument_name,
img_url,
year_constructed,
monument_location,
monument_country,
altitude_feet,
altitude_meters) {
paste0(
"
<!DOCTYPE html>
<html>
<head>
<style>
/* Popup Container Styles */
.popup-container {
width: 300px; /* Adjust the width as needed */
padding: 10px;
display: flex;
}
.left-column {
flex: 1;
max-width: 40%;
max-height: 225px;
position: relative;
overflow: hidden;
}
/* Right Column Styles */
.right-column {
flex: 2;
padding-left: 10px;
align-self: center;
}
/* Image Styles */
.popup-image {
max-width: 100%;
height: auto;
max-height: 200px;
}
/* Title Style */
.popup-title {
font-size: 14px;
font-weight: bold;
margin-bottom: 5px;
}
/* Field Styles */
.popup-field {
font-size: 14px;
margin-bottom: 3px;
}
</style>
</head>
<body>
<div class='popup-container'>
<div class='left-column'>
<img class= 'popup-image', src='",
img_url,
"'>
</div>
<div class='right-column'>
<div class='popup-title'>",
monument_name,
"</div>
<div class='popup-field'><strong>Country:</strong> ",
monument_country,
"</div>
<div class='popup-field'><strong>Location:</strong> ",
monument_location,
"</div>
<div class='popup-field'><strong>Constructed:</strong> ",
year_constructed,
"</div>
<div class='popup-field'><strong>Altitude:</strong> ",
altitude_meters,
" m. </div>
</div>
</div>
</body>
</html>"
)
}
obelisks.plot <- obelisks.cleaned |>
mutate(
popup = get.html(
monument_name = name,
img_url = url1,
year_constructed = years,
monument_location = location,
monument_country = country,
altitude_feet = elevation_f,
altitude_meters = elevation_m
)
)
leaflet(obelisks.cleaned) |>
addTiles() |>
addMarkers(
lat = as.numeric(obelisks.plot$lat),
lng = as.numeric(obelisks.plot$long),
popup = obelisks.plot$popup
) 



