Chapter 6 Extract content

6.1 Import Word document

The function docx_summary() returns the content of a Word document.

library(officer)
# Locate the example Word document shipped with the officer package.
example_docx <- system.file(
  "doc_examples", "example.docx",
  package = "officer"
)
# Read the document, then extract its content with docx_summary().
doc <- read_docx(example_docx)
content <- docx_summary(doc)
head(content)

Explore the results:

# Count the distinct doc_index values found for each content type.
with(
  content,
  tapply(doc_index, content_type, function(idx) length(unique(idx)))
)
#  paragraph table cell 
#         17          1

To get all paragraphs:

# Keep the paragraph rows only, restricted to the columns describing them.
par_data <- subset(
  content,
  content_type %in% "paragraph",
  select = c(doc_index, style_name, text, level, num_id)
)
# Truncate each text to at most 30 characters; substr() already stops at the
# end of strings that are shorter than `stop`.
par_data$text <- substr(par_data$text, start = 1, stop = 30)
par_data

There is no support to extract images stored in Word paragraphs.

6.1.1 Word tables

Tables are unstacked, i.e. each table cell is returned as a separate row:

# Select the rows describing table cells and print the first ones.
table_cells <- content[content$content_type %in% "table cell", ]
print(head(table_cells))
#      doc_index content_type    style_name        text level num_id row_id
# 1.1         16   table cell Light Shading      Petals    NA     NA      1
# 1.11        16   table cell Light Shading 5,621498349    NA     NA      2
# 1.12        16   table cell Light Shading 4,994616997    NA     NA      3
# 1.13        16   table cell Light Shading 4,767504884    NA     NA      4
# 1.14        16   table cell Light Shading  25,9242382    NA     NA      5
# 1.15        16   table cell Light Shading 6,489375001    NA     NA      6
#      is_header cell_id col_span row_span
# 1.1       TRUE       1        1        1
# 1.11     FALSE       1        2        1
# 1.12     FALSE       1        1        1
# 1.13     FALSE       1        1        1
# 1.14     FALSE       1        2        1
# 1.15     FALSE       1        1        1

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of a header). Note that the content itself (column text) is a character vector.

# Keep the non-header cells, with their position and text columns only.
table_body <- subset(
  table_cells,
  !is_header,
  select = c(row_id, cell_id, text)
)
head(table_body)

Reshaping the data with columns row_id, cell_id and text would display something close to the original table:

# Spread the cell texts into a row_id x cell_id layout; I() returns each
# text as is.
with(
  table_body,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)
#       cell_id
# row_id 1             2             3                      
#     2  "5,621498349" NA            "2,46210657918,2034091"
#     3  "4,994616997" "AA"          "2,429320759"          
#     4  "4,767504884" NA            "AAA"                  
#     5  "25,9242382"  NA            "2,066051345"          
#     6  "6,489375001" "25,21130805" "2,901582763"          
#     7  "5,7858682"   "25,52433147" "2,655642742"          
#     8  "5,645575295" "Merged cell" "2,278691288"          
#     9  "4,828953215" NA            "2,238467716"          
#     10 "6,783500773" NA            "2,202762147"          
#     11 "5,395076839" NA            "2,538375992"          
#     12 "4,683617783" "29,2459239"  "2,601945544"          
#     13 "Note"        NA            NA                     
#       cell_id
# row_id 4                                 
#     2  NA                                
#     3  "17,65204912"                     
#     4  NA                                
#     5  "18,37915478"                     
#     6  "17,3130473717,0721572418,2902189"
#     7  NA                                
#     8  NA                                
#     9  "19,87376227"                     
#     10 "19,85326662"                     
#     11 "19,56545356"                     
#     12 "18,95335451"                     
#     13 NA

Getting headers requires another operation:

# Keep the header cells, with their position and text columns only.
data <- subset(
  table_cells,
  is_header,
  select = c(row_id, cell_id, text)
)

# Spread the header texts into a row_id x cell_id layout.
with(
  data,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)
#       cell_id
# row_id 1        2           3       4      
#      1 "Petals" "Internode" "Sepal" "Bract"

6.2 Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document.

# Locate the example PowerPoint document shipped with the officer package.
example_pptx <- system.file(package = "officer", "doc_examples/example.pptx")
# Make sure the destination directory exists: file.copy() does not create it
# and only reports a failure through its FALSE return value.
dir.create("reports", showWarnings = FALSE, recursive = TRUE)
# overwrite = TRUE refreshes a copy left by a previous run instead of making
# file.copy() return FALSE and leaving a possibly stale file in place.
file.copy(example_pptx, to = "reports/example.pptx", overwrite = TRUE)
# [1] TRUE

# Read the copied presentation, then extract its content with pptx_summary().
doc <- read_pptx("reports/example.pptx")
content <- pptx_summary(doc)
head(content)

Explore the results:

# Count the distinct id values found for each content type.
with(
  content,
  tapply(id, content_type, function(ids) length(unique(ids)))
)
#      image  paragraph table cell 
#          1          5          2

To get all paragraphs:

# Paragraph rows only, restricted to the id and text columns.
par_data <- content[content$content_type %in% "paragraph", c("id", "text")]
head(par_data)

To get an image:

# Rows of the summary that describe images.
image_row <- content[content$content_type %in% "image", ]
# Target file for the extracted image, in the session temporary directory.
img <- file.path(tempdir(), "extract.png")
media_extract(doc, path = image_row$media_file, target = img)
# [1] TRUE

6.2.1 PowerPoint tables

Tables are unstacked, i.e. each table cell is returned as a separate row:

# Select the rows describing table cells and show the first ones.
table_cells <- content[content$content_type %in% "table cell", ]
head(table_cells)

Cell positions and values are stored in the columns row_id, cell_id and text. Note that here there is no indicator for the table header.

# Cells whose id is 18, with their position and text columns only.
data <- subset(table_cells, id == 18, select = c(row_id, cell_id, text))
# Spread the cell texts into a row_id x cell_id layout.
with(
  data,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)
#       cell_id
# row_id 1           2          3               
#      1 "Header 1 " "Header 2" "Header 3"      
#      2 "A"         "12.23"    "blah blah"     
#      3 "B"         "1.23"     "blah blah blah"
#      4 "B"         "9.0"      "Salut"         
#      5 "C"         "6"        "Hello"