library(rvest)
library(tidyverse)
library(xml2)
library(leaflet)
<- read_html("http://en.wikipedia.org/wiki/List_of_modern_obelisks", trim = T)
doc
<- doc |>
tables html_nodes(".wikitable") |>
as.list()
Scraping a Wikipedia table with images in rvest
is annoying, as seemingly indicated by the Stack Overflow answers to people with similar problems.
My goal is to represent “List of Modern Obelisks” as a Leaflet map showing the global distribution of the monuments. The core problem is that html_table
appears to use xml_text
to extract text information. What I want is a column giving the src=
url, so I can include an image in the Leaflet popup for each obelisk.
There are multiple tables in this list, one for each century. I can start by getting a list of these tables by selecting .wikitable
objects from our html response.
Instead of using html_table
I’ll iterate through each row, extracting the text and image link separately, then building a new row.
<- function(broken.row){
clean.row
# Get table values from row
<- broken.row |>
broken.row html_nodes("td")
# Get just the texxt (breaks links)
<- broken.row |> xml_text()
cleaned.row
# Select image element in row, extract link
<- broken.row |> html_nodes(".mw-file-element") |> html_attr("srcset")
image.link
# If there is an image link
if(!(is.null(image.link)|length(image.link) == 0)){
# Replace missing image value with the link
2] <- image.link
cleaned.row[else {
} # Otherwise replace with nothing
2] <- ""
cleaned.row[
}# Convert row to tibble
<- cleaned.row |> as_tibble_row(.name_repair = "unique")
new_tib # Set names
colnames(new_tib) <- c("name", "image_url", "location", "country", "elevation_m", "elevation_f", "years", "coords", "notes")
new_tib
}
<- function(table){
get.table # Get list of the table's rows
<- table |>
table.rows html_nodes("tr")
# For each row (skipping the header rows), clean the row and combine the tibble
bind_rows(map(table.rows[3:length(table.rows)], clean.row))
}
# Call get.table on each of the tables, combining all of the rows
<- bind_rows(map(tables, get.table)) obelisks.raw.scraped
This results in a combined tibble with everything we need. I have to do some cleaning before making my map.
<- function(l) {
fix.image.links |>
l str_remove(pattern = " 1.5x| 2x") |>
str_squish() |>
str_replace(pattern = "[//]{2}", replacement = "https://")
}
<- function(c) str_split_1(c, " / ") |> tail(n = 1) |> str_replace(pattern = ";", replacement = ",")
fix.coords
<- Vectorize(fix.coords)
fix.coords.v
<- obelisks.raw.scraped |>
obelisks.cleaned # Separate default url values (two separate links to different sized versions)
separate_wider_delim(cols = image_url, delim = ",", names = c("url1", "url2"), too_few = "align_start") |>
mutate(across(starts_with("url"), ~ fix.image.links(.))) |>
# Coords originally come with three different formats
mutate(coords = fix.coords.v(coords)) |>
separate_wider_delim(cols = coords, delim = ",", names = c("lat", "long"), too_few = "align_start") |>
mutate(across(c(lat, long), ~ as.numeric(.)))
To plot the result, we need some custom HTML for the popup. I asked ChatGPT to generate the structure on a whim and was suprised by how good the result was, though it did need some tweaking. This is the prompt I used:
Create styled html for a popup pane for leaflet with two columns side by side. The left column fits a portrait oriented photo with about 30% of the entire width. The right side has a title and various fields with values underneath it
<-
get.html function(monument_name,
img_url,
year_constructed,
monument_location,
monument_country,
altitude_feet,
altitude_meters) {paste0(
"
<!DOCTYPE html>
<html>
<head>
<style>
/* Popup Container Styles */
.popup-container {
width: 300px; /* Adjust the width as needed */
padding: 10px;
display: flex;
}
.left-column {
flex: 1;
max-width: 40%;
max-height: 225px;
position: relative;
overflow: hidden;
}
/* Right Column Styles */
.right-column {
flex: 2;
padding-left: 10px;
align-self: center;
}
/* Image Styles */
.popup-image {
max-width: 100%;
height: auto;
max-height: 200px;
}
/* Title Style */
.popup-title {
font-size: 14px;
font-weight: bold;
margin-bottom: 5px;
}
/* Field Styles */
.popup-field {
font-size: 14px;
margin-bottom: 3px;
}
</style>
</head>
<body>
<div class='popup-container'>
<div class='left-column'>
<img class= 'popup-image', src='",
img_url,"'>
</div>
<div class='right-column'>
<div class='popup-title'>",
monument_name,"</div>
<div class='popup-field'><strong>Country:</strong> ",
monument_country,"</div>
<div class='popup-field'><strong>Location:</strong> ",
monument_location,"</div>
<div class='popup-field'><strong>Constructed:</strong> ",
year_constructed,"</div>
<div class='popup-field'><strong>Altitude:</strong> ",
altitude_meters," m. </div>
</div>
</div>
</body>
</html>"
) }
The rest is pretty straightforward. I pass values from the dataframe to get.html
from above, which generates a unique popup for each obelisk.
<- obelisks.cleaned |>
obelisks.plot mutate(
popup = get.html(
monument_name = name,
img_url = url1,
year_constructed = years,
monument_location = location,
monument_country = country,
altitude_feet = elevation_f,
altitude_meters = elevation_m
)
)
leaflet(obelisks.cleaned) |>
addTiles() |>
addMarkers(
lat = as.numeric(obelisks.plot$lat),
lng = as.numeric(obelisks.plot$long),
popup = obelisks.plot$popup
)
Here’s everything all together:
library(rvest)
library(tidyverse)
library(xml2)
library(leaflet)
<-
doc read_html("http://en.wikipedia.org/wiki/List_of_modern_obelisks",
trim = T)
<- doc |>
tables html_nodes(".wikitable") |>
as.list()
<- function(broken.row) {
clean.row # Get table values from row
<- broken.row |>
broken.row html_nodes("td")
# Get just the texxt (breaks links)
<- broken.row |> xml_text()
cleaned.row
# Select image element in row, extract link
<-
image.link |> html_nodes(".mw-file-element") |> html_attr("srcset")
broken.row
# If there is an image link
if (!(is.null(image.link) | length(image.link) == 0)) {
# Replace missing image value with the link
2] <- image.link
cleaned.row[else {
} # Otherwise replace with nothing
2] <- ""
cleaned.row[
}# Convert row to tibble
<-
new_tib |> as_tibble_row(.name_repair = "unique")
cleaned.row # Set names
colnames(new_tib) <-
c(
"name",
"image_url",
"location",
"country",
"elevation_m",
"elevation_f",
"years",
"coords",
"notes"
)
new_tib
}
<- function(table) {
get.table <- table |>
table.rows html_nodes("tr")
bind_rows(map(table.rows[3:length(table.rows)], clean.row))
}
<- bind_rows(map(tables, get.table))
obelisks.raw.scraped
### Clean obelisk data
<- function(l) {
fix.image.links |>
l str_remove(pattern = " 1.5x| 2x") |>
str_squish() |>
str_replace(pattern = "[//]{2}", replacement = "https://")
}
<- function(c) str_split_1(c, " / ") |> tail(n = 1) |> str_replace(pattern = ";", replacement = ",")
fix.coords
<- Vectorize(fix.coords)
fix.coords.v
<- obelisks.raw.scraped |>
obelisks.cleaned separate_wider_delim(cols = image_url, delim = ",", names = c("url1", "url2"), too_few = "align_start") |>
mutate(across(starts_with("url"), ~ fix.image.links(.))) |>
mutate(coords = fix.coords.v(coords)) |>
separate_wider_delim(cols = coords, delim = ",", names = c("lat", "long"), too_few = "align_start") |>
mutate(across(c(lat, long), ~ as.numeric(.)))
### Plotting
<-
get.html function(monument_name,
img_url,
year_constructed,
monument_location,
monument_country,
altitude_feet,
altitude_meters) {paste0(
"
<!DOCTYPE html>
<html>
<head>
<style>
/* Popup Container Styles */
.popup-container {
width: 300px; /* Adjust the width as needed */
padding: 10px;
display: flex;
}
.left-column {
flex: 1;
max-width: 40%;
max-height: 225px;
position: relative;
overflow: hidden;
}
/* Right Column Styles */
.right-column {
flex: 2;
padding-left: 10px;
align-self: center;
}
/* Image Styles */
.popup-image {
max-width: 100%;
height: auto;
max-height: 200px;
}
/* Title Style */
.popup-title {
font-size: 14px;
font-weight: bold;
margin-bottom: 5px;
}
/* Field Styles */
.popup-field {
font-size: 14px;
margin-bottom: 3px;
}
</style>
</head>
<body>
<div class='popup-container'>
<div class='left-column'>
<img class= 'popup-image', src='",
img_url,"'>
</div>
<div class='right-column'>
<div class='popup-title'>",
monument_name,"</div>
<div class='popup-field'><strong>Country:</strong> ",
monument_country,"</div>
<div class='popup-field'><strong>Location:</strong> ",
monument_location,"</div>
<div class='popup-field'><strong>Constructed:</strong> ",
year_constructed,"</div>
<div class='popup-field'><strong>Altitude:</strong> ",
altitude_meters," m. </div>
</div>
</div>
</body>
</html>"
)
}
<- obelisks.cleaned |>
obelisks.plot mutate(
popup = get.html(
monument_name = name,
img_url = url1,
year_constructed = years,
monument_location = location,
monument_country = country,
altitude_feet = elevation_f,
altitude_meters = elevation_m
)
)
leaflet(obelisks.cleaned) |>
addTiles() |>
addMarkers(
lat = as.numeric(obelisks.plot$lat),
lng = as.numeric(obelisks.plot$long),
popup = obelisks.plot$popup
)