library(tidyverse)
library(httr)
library(jsonlite)
library(lubridate)
library(rvest)

# Product pages whose price history is downloaded.
urls <- c(
  "https://geizhals.eu/gigabyte-geforce-rtx-3080-gaming-oc-10g-gv-n3080gaming-oc-10gd-a2366735.html",
  "https://geizhals.eu/gigabyte-geforce-rtx-3070-gaming-oc-8g-gv-n3070gaming-oc-8gd-a2406936.html?hloc=at&hloc=de&hloc=eu&hloc=pl&hloc=uk",
  "https://geizhals.eu/msi-geforce-rtx-3080-ti-gaming-x-trio-12g-a2538377.html?hloc=at&hloc=de&hloc=eu&hloc=pl&hloc=uk",
  "https://geizhals.eu/powercolor-radeon-rx-6700-xt-red-devil-axrx-6700xt-12gbd6-3dhe-oc-a2494750.html"
)

#' Download the price history for one product page
#'
#' Reads the product page at `url`, extracts the product id and product name
#' from the page text, and requests the price history for that id from the
#' `price_history` endpoint.
#'
#' @param url URL of a single product page.
#' @return A data frame with columns `price`, `availability`, `date`,
#'   `id_gpu`, `name` and `url`.
get_geizhals_data <- function(url) {
  html_raw <- read_html(url) %>%
    html_text()

  # The id and name are read from the `productIds: ` / `productName: `
  # fragments found in the page text.
  id <- str_extract(html_raw, "(?<=productIds: )[0-9]+")
  if (length(id) != 1L || is.na(id)) {
    stop("Could not find a product id in page: ", url, call. = FALSE)
  }

  # The match includes the surrounding single quotes; strip both of them
  # (removing only the first match would leave a trailing quote in the name).
  name <- html_raw %>%
    str_extract("(?<=productName: )'.+'") %>%
    str_remove_all("^'|'$")

  request_url <- "https://geizhals.eu/api/gh0/price_history"
  args <- list(id = id, params = list(days = 9999, loc = "eu"))
  body <- toJSON(args, auto_unbox = TRUE)

  raw <- POST(request_url, body = body, encode = "json")
  # Fail with the HTTP status instead of an obscure parsing error later on.
  stop_for_status(raw)

  tmp <- fromJSON(content(raw, as = "text", encoding = "UTF-8"))

  # NOTE(review): assumes `response` holds exactly three columns in the order
  # timestamp, price, availability - confirm against the API.
  tmp_df <- data.frame(tmp$response)
  colnames(tmp_df) <- c("timestamp", "price", "availability")

  # `timestamp` is divided by 1000 before conversion, i.e. it is treated as
  # milliseconds since 1970-01-01; the calendar date is taken in UTC.
  tmp_df <- tmp_df %>%
    mutate(
      date = as.Date(
        as.POSIXct(timestamp / 1000, origin = "1970-01-01", tz = "UTC"),
        tz = "UTC"
      )
    ) %>%
    select(-timestamp)

  tmp_df$id_gpu <- id
  tmp_df$name <- name
  tmp_df$url <- url

  tmp_df
}

# One price-history data frame per URL, stacked into a single data frame.
gpu_data <- urls %>%
  map(get_geizhals_data) %>%
  bind_rows()