Chapter 6 Extract content

6.1 Import Word document

The function docx_summary() returns the content of a Word document.

library(officer)
example_docx <- system.file(package = "officer", "doc_examples/example.docx")
doc <- read_docx(example_docx)
content <- docx_summary(doc)
head(content)

Explore the results:

tapply(content$doc_index, 
       content$content_type, 
       function(x) length(unique(x)))
#>  paragraph table cell 
#>         17          1

To get all paragraphs:

par_data <- subset(content, content_type %in% "paragraph") 
par_data <- par_data[, c("doc_index", "style_name", 
                         "text", "level", "num_id") ]
par_data$text <- with(par_data, {
  substr(
    text, start = 1, 
    stop = ifelse(nchar(text)<30, nchar(text), 30) )
})
par_data

6.1.1 Word tables

Tables are unstacked, i.e. returned with one row per cell:

table_cells <- subset(content, content_type %in% "table cell")
print(head( table_cells) )
#>      doc_index content_type    style_name        text level num_id row_id
#> 1.1         16   table cell Light Shading      Petals    NA     NA      1
#> 1.11        16   table cell Light Shading 5,621498349    NA     NA      2
#> 1.12        16   table cell Light Shading 4,994616997    NA     NA      3
#> 1.13        16   table cell Light Shading 4,767504884    NA     NA      4
#> 1.14        16   table cell Light Shading  25,9242382    NA     NA      5
#> 1.15        16   table cell Light Shading 6,489375001    NA     NA      6
#>      is_header cell_id col_span row_span
#> 1.1       TRUE       1        1        1
#> 1.11     FALSE       1        2        1
#> 1.12     FALSE       1        1        1
#> 1.13     FALSE       1        1        1
#> 1.14     FALSE       1        2        1
#> 1.15     FALSE       1        1        1

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of a header). Note that the content itself (column text) is a character vector.

table_body <- subset(table_cells, !is_header)
table_body <- table_body[,c("row_id", "cell_id", "text")]
head(table_body)

Reshaping the data with the columns row_id, cell_id and text displays something close to the original table:

tapply(table_body$text, 
       list(row_id = table_body$row_id, 
            cell_id = table_body$cell_id
            ), 
       FUN = I
       )
#>       cell_id
#> row_id 1             2             3                      
#>     2  "5,621498349" NA            "2,46210657918,2034091"
#>     3  "4,994616997" "AA"          "2,429320759"          
#>     4  "4,767504884" NA            "AAA"                  
#>     5  "25,9242382"  NA            "2,066051345"          
#>     6  "6,489375001" "25,21130805" "2,901582763"          
#>     7  "5,7858682"   "25,52433147" "2,655642742"          
#>     8  "5,645575295" "Merged cell" "2,278691288"          
#>     9  "4,828953215" NA            "2,238467716"          
#>     10 "6,783500773" NA            "2,202762147"          
#>     11 "5,395076839" NA            "2,538375992"          
#>     12 "4,683617783" "29,2459239"  "2,601945544"          
#>     13 "Note"        NA            NA                     
#>       cell_id
#> row_id 4                                 
#>     2  NA                                
#>     3  "17,65204912"                     
#>     4  NA                                
#>     5  "18,37915478"                     
#>     6  "17,3130473717,0721572418,2902189"
#>     7  NA                                
#>     8  NA                                
#>     9  "19,87376227"                     
#>     10 "19,85326662"                     
#>     11 "19,56545356"                     
#>     12 "18,95335451"                     
#>     13 NA

Getting headers requires another operation:

data <- subset(table_cells, is_header) 
data <- data[, c("row_id", "cell_id", "text") ] 

tapply(data$text, 
   list(row_id = data$row_id, 
        cell_id = data$cell_id
        ), FUN = I )
#>       cell_id
#> row_id 1        2           3       4      
#>      1 "Petals" "Internode" "Sepal" "Bract"

6.2 Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document.

example_pptx <- system.file(package = "officer", "doc_examples/example.pptx")
file.copy(example_pptx, to = "reports/example.pptx")
#> [1] FALSE

doc <- read_pptx("reports/example.pptx")
content <- pptx_summary(doc)
head(content)

Explore the results:

tapply(content$id, 
       content$content_type, 
       function(x) length(unique(x)))
#>      image  paragraph table cell 
#>          1          5          2

To get all paragraphs:

par_data <- subset(content, 
                   content_type %in% "paragraph", 
                   select = c(id, text) )
head(par_data)

To get an image:

image_row <- subset(content, content_type %in% "image")
img  <- file.path(tempdir(), "extract.png")
media_extract(doc, path = image_row$media_file, target = img)
#> [1] TRUE

6.2.1 PowerPoint tables

Tables are unstacked, i.e. returned with one row per cell:

table_cells <- subset(content, content_type %in% "table cell")
head(table_cells)

Cell positions and values are stored in the columns row_id, cell_id and text. Note that here there is no indicator for the table header.

data <- subset(table_cells, id == 18, c(row_id, cell_id, text) )
tapply(data$text, 
   list(row_id = data$row_id, 
        cell_id = data$cell_id
        ), FUN = I )
#>       cell_id
#> row_id 1           2          3               
#>      1 "Header 1 " "Header 2" "Header 3"      
#>      2 "A"         "12.23"    "blah blah"     
#>      3 "B"         "1.23"     "blah blah blah"
#>      4 "B"         "9.0"      "Salut"         
#>      5 "C"         "6"        "Hello"