Skip to content Skip to sidebar Skip to footer

Extracting Html Table With Rowspan Values

The data frame I create with the following code (using the RCurl and XML packages) puts the three letter team abbreviation into only the first row that it spans. Is there another p

Solution 1:

You are actually pretty close. The only thing you need to do is to get the data in the proper columns and rows as some of the rows have shifted to the left. You can achieve this as follows (with the help of the data.table and zoo packages):

# your original code
url <-"https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
url_source <- readLines(url, encoding ="UTF-8")
playoffs <- data.frame(readHTMLTable(url_source, stringsAsFactors =F, header =T)[2])# assigning proper names to the columnsnames(playoffs)<-c("shortcode","franchise","years","appearances")# 1. shift the dat columnwise for the rows in which there is no shortcode# 2. fill the resulting NA's with the last observation# 3. only keep the last shortcode when the previous ones are the same#    because only there the shortcode matches the franchise name
library(data.table)
library(zoo)
setDT(playoffs)[nchar(shortcode)>3, `:=` (shortcode =NA,
                                            franchise = shortcode,
                                            years = franchise,
                                            appearances = years)][, shortcode := na.locf(shortcode)][shortcode == shift(shortcode,1L, type="lead"), shortcode :=NA]

Solution 2:

Here's an answer. I took the liberty up the data

library(dplyr)
library(XML)
library(RCurl)
library(stringi)
library(zoo)
library(tidyr)

initial_data =
  "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams" %>%
  readLines(encoding = "UTF-8") %>%
  readHTMLTable(stringsAsFactors = F) %>%
  `[[`(2) %>%
  mutate(ID =1:n(),
         test =
           V1 %>%
           stri_detect_regex("^[A-Z]{2,3}$"))

variable_names = c("franchise",
                   "years",
                   "initial_postseason_appearances")

shifts = 
  initial_data %>%filter(test %>% `!`) %>%
  setNames(c(variable_names,
             "trash",
             "ID",
             "test"))

team_initial =
  initial_data %>%filter(test) %>%
  setNames(c("initial_abbreviation",
             variable_names,
             "ID",
             "test")) %>%
  bind_rows(shifts) %>%
  arrange(ID) %>%
  separate(years, c("start", "end")) %>%
  mutate(abbreviation = initial_abbreviation %>% na.locf,
         split_postseason_appearances =
           initial_postseason_appearances %>%
           plyr::mapvalues("–", NA) %>%
           stri_split_fixed(", ") )

appearance = 
  team_initial %>%select(franchise,
         split_postseason_appearances) %>%unnest(split_postseason_appearances) %>%
  mutate(postseason_appearance =
           split_postseason_appearances %>%
           extract_numeric) %>%select(-split_postseason_appearances)

team = 
  team_initial %>%select(abbreviation,
         franchise,
         start,
         end)

Solution 3:

Consider an XML package solution, requiring various XPath expressions using xpathSApply() with for loop and if/then logic. To capture the row-spanned table records, various XPath string functions are used: string-length(), concat(), and substring():

library(XML)

# PARSE FROM URL
url <- "https://en.wikipedia.org/wiki/List_of_Major_League_Baseball_postseason_teams"
webpage <- readLines(url)
html = htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)

# INITIALIZE LISTS
code <- c()
team <- c()
year <- c()
postseason <- c()

# APPEND TO LISTS LOOPING ACROSS ALL TEAMS
numberofteams <- length(xpathSApply(html, "//table[2]//tr/td[1]"))

for (i in (1:numberofteams+1)) {
  # TR NODES WITH LETTER TEAM ABBREVIATION (STRING LENGTH=2 or 3)if (as.character(xpathSApply(html, sprintf("string-length(//table[2]/tr[%s]/td[1])", i), xmlValue)) %in% c("2","3")) {

    code <- c(code, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[4]", i), xmlValue))
  } else {
    # TR NODES W/O LETTER TEAM ABBREVIATION       
    code <- c(code, xpathSApply(html, sprintf("substring(concat(//table[2]/tr[position()=%s-1]/td[position()=1 and string-length(.)=3],
                                               //table[2]/tr[position()=%s-2]/td[position()=1 and string-length(.)=3]), 1, 3)", i, i), xmlValue))
    team <- c(team, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[1]", i), xmlValue))
    year <- c(year, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[2]", i), xmlValue))
    postseason <- c(postseason, xpathSApply(html, sprintf("//table[2]/tr[%s]/td[3]", i), xmlValue))        

  }
}

# COMBINE LISTS INTO DATA FRAME
playoffs <- data.frame(code = unlist(code), 
                       team = unlist(team), 
                       year = unlist(year), 
                       postseason = unlist(postseason))

Solution 4:

Give htmltab a try:

install.packages("htmltab")
library(htmltab)

purl <- htmlParse(url_source)
htmltab(purl, which = 2)

Post a Comment for "Extracting Html Table With Rowspan Values"