Chapter 7 Extract content

7.1 Import Word document

The function docx_summary() returns the content of a Word document.

library(officer)
example_docx <- system.file(package = "officer", "doc_examples/example.docx")
doc <- read_docx(example_docx)
content <- docx_summary(doc)
head(content)

| doc_index | content_type | style_name | text | level | num_id | row_id | is_header | cell_id | col_span | row_span |
|---|---|---|---|---|---|---|---|---|---|---|
| integer | character | character | character | numeric | integer | integer | logical | numeric | numeric | integer |
| 1 | paragraph | heading 1 | Title 1 | | | | | | | |
| 2 | paragraph | | Lorem ipsum dolor sit amet, consectetur adipiscing elit. | | | | | | | |
| 3 | paragraph | heading 1 | Title 2 | | | | | | | |
| 4 | paragraph | List Paragraph | Quisque tristique | 1 | 2 | | | | | |
| 5 | paragraph | List Paragraph | Augue nisi, et convallis | 1 | 2 | | | | | |
| 6 | paragraph | List Paragraph | Sapien mollis nec. | 1 | 2 | | | | | |

n: 6

Explore the results:

tapply(content$doc_index, 
       content$content_type, 
       function(x) length(unique(x)))
#  paragraph table cell 
#         17          1

To get all paragraphs:

par_data <- subset(content, content_type %in% "paragraph") 
par_data <- par_data[, c("doc_index", "style_name", 
                         "text", "level", "num_id") ]
par_data$text <- with(par_data, {
  substr(
    text, start = 1, 
    stop = ifelse(nchar(text)<30, nchar(text), 30) )
})
par_data

| doc_index | style_name | text | level | num_id |
|---|---|---|---|---|
| integer | character | character | numeric | integer |
| 1 | heading 1 | Title 1 | | |
| 2 | | Lorem ipsum dolor sit amet, co | | |
| 3 | heading 1 | Title 2 | | |
| 4 | List Paragraph | Quisque tristique | 1 | 2 |
| 5 | List Paragraph | Augue nisi, et convallis | 1 | 2 |
| 6 | List Paragraph | Sapien mollis nec. | 1 | 2 |
| 7 | heading 2 | Sub title 1 | | |
| 8 | List Paragraph | Quisque tristique | 1 | 1 |
| 9 | List Paragraph | Augue nisi, et convallis | 1 | 1 |
| 10 | List Paragraph | Sapien mollis nec. | 1 | 1 |
| 11 | | | | |
| 12 | | Phasellus nec nunc vitae nulla | | |
| 13 | heading 2 | Sub title 2 | | |
| 14 | | Morbi rhoncus sapien sit amet | | |
| 15 | | | | |
| 17 | | | | |
| 18 | | | | |

n: 17

Extracting images stored in Word paragraphs is not supported.

7.1.1 Word tables

Tables are unstacked, with one row per cell:

table_cells <- subset(content, content_type %in% "table cell")
print(head( table_cells) )
#      doc_index content_type    style_name        text level num_id row_id
# 1.1         16   table cell Light Shading      Petals    NA     NA      1
# 1.11        16   table cell Light Shading 5,621498349    NA     NA      2
# 1.12        16   table cell Light Shading 4,994616997    NA     NA      3
# 1.13        16   table cell Light Shading 4,767504884    NA     NA      4
# 1.14        16   table cell Light Shading  25,9242382    NA     NA      5
# 1.15        16   table cell Light Shading 6,489375001    NA     NA      6
#      is_header cell_id col_span row_span
# 1.1       TRUE       1        1        1
# 1.11     FALSE       1        2        1
# 1.12     FALSE       1        1        1
# 1.13     FALSE       1        1        1
# 1.14     FALSE       1        2        1
# 1.15     FALSE       1        1        1

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of a header). Note that the content itself (column text) is a character vector.

table_body <- subset(table_cells, !is_header)
table_body <- table_body[,c("row_id", "cell_id", "text")]
head(table_body)

| row_id | cell_id | text |
|---|---|---|
| integer | numeric | character |
| 2 | 1 | 5,621498349 |
| 3 | 1 | 4,994616997 |
| 4 | 1 | 4,767504884 |
| 5 | 1 | 25,9242382 |
| 6 | 1 | 6,489375001 |
| 7 | 1 | 5,7858682 |

n: 6

Reshaping the data with columns row_id, cell_id and text displays something close to the original table:

tapply(table_body$text, 
       list(row_id = table_body$row_id, 
            cell_id = table_body$cell_id
            ), 
       FUN = I
       )
#       cell_id
# row_id 1                   2             3                      
#     2  "5,621498349"       NA            "2,46210657918,2034091"
#     3  "4,994616997"       "AA"          "2,429320759"          
#     4  "4,767504884"       NA            "AAA"                  
#     5  "25,9242382"        NA            "2,066051345"          
#     6  "6,489375001"       "25,21130805" "2,901582763"          
#     7  "5,7858682"         "25,52433147" "2,655642742"          
#     8  "5,645575295"       "Merged cell" "2,278691288"          
#     9  "4,828953215"       NA            "2,238467716"          
#     10 "6,783500773"       NA            "2,202762147"          
#     11 "5,395076839"       NA            "2,538375992"          
#     12 "4,683617783"       "29,2459239"  "2,601945544"          
#     13 "NoteNew line note" NA            NA                     
#       cell_id
# row_id 4                                 
#     2  NA                                
#     3  "17,65204912"                     
#     4  NA                                
#     5  "18,37915478"                     
#     6  "17,3130473717,0721572418,2902189"
#     7  NA                                
#     8  NA                                
#     9  "19,87376227"                     
#     10 "19,85326662"                     
#     11 "19,56545356"                     
#     12 "18,95335451"                     
#     13 NA

Getting headers requires another operation:

data <- subset(table_cells, is_header) 
data <- data[, c("row_id", "cell_id", "text") ] 

tapply(data$text, 
   list(row_id = data$row_id, 
        cell_id = data$cell_id
        ), FUN = I )
#       cell_id
# row_id 1        2           3       4      
#      1 "Petals" "Internode" "Sepal" "Bract"

7.2 Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document.

example_pptx <- system.file(package = "officer", "doc_examples/example.pptx")
file.copy(example_pptx, to = "static/reports/example.pptx")
# [1] FALSE

doc <- read_pptx("static/reports/example.pptx")
content <- pptx_summary(doc)
head(content)

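| text | id | content_type | slide_id | row_id | cell_id | col_span | row_span | media_file |
|---|---|---|---|---|---|---|---|---|
| character | character | character | integer | integer | integer | integer | integer | character |
| Title | 12 | paragraph | 1 | | | | | |
| A table | 13 | paragraph | 1 | | | | | |
| and some text | 13 | paragraph | 1 | | | | | |
| and some list (1) | 13 | paragraph | 1 | | | | | |
| and some list (2) | 13 | paragraph | 1 | | | | | |
| Header 1 | 18 | table cell | 1 | 1 | 1 | 1 | 1 | |

n: 6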

Explore the results:

tapply(content$id, 
       content$content_type, 
       function(x) length(unique(x)))
#      image  paragraph table cell 
#          1          5          2

To get all paragraphs:

par_data <- subset(content, 
                   content_type %in% "paragraph", 
                   select = c(id, text) )
head(par_data)

| id | text |
|---|---|
| character | character |
| 12 | Title |
| 13 | A table |
| 13 | and some text |
| 13 | and some list (1) |
| 13 | and some list (2) |
| 15 | R logo |

n: 6

To get an image:

image_row <- subset(content, content_type %in% "image")
img  <- file.path(tempdir(), "extract.png")
media_extract(doc, path = image_row$media_file, target = img)
# [1] TRUE

7.2.1 PowerPoint tables

Tables are unstacked, with one row per cell:

table_cells <- subset(content, content_type %in% "table cell")
head(table_cells)

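| text | id | content_type | slide_id | row_id | cell_id | col_span | row_span | media_file |
|---|---|---|---|---|---|---|---|---|
| character | character | character | integer | integer | integer | integer | integer | character |
| Header 1 | 18 | table cell | 1 | 1 | 1 | 1 | 1 | |
| A | 18 | table cell | 1 | 2 | 1 | 1 | 1 | |
| B | 18 | table cell | 1 | 3 | 1 | 1 | 1 | |
| B | 18 | table cell | 1 | 4 | 1 | 1 | 1 | |
| C | 18 | table cell | 1 | 5 | 1 | 1 | 1 | |
| Header 2 | 18 | table cell | 1 | 1 | 2 | 1 | 1 | |

n: 6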

Cell positions and values are stored in the columns row_id, cell_id and text. Note that here there is no indicator for the table header.

data <- subset(table_cells, id == 18, c(row_id, cell_id, text) )
tapply(data$text, 
   list(row_id = data$row_id, 
        cell_id = data$cell_id
        ), FUN = I )
#       cell_id
# row_id 1           2          3               
#      1 "Header 1 " "Header 2" "Header 3"      
#      2 "A"         "12.23"    "blah blah"     
#      3 "B"         "1.23"     "blah blah blah"
#      4 "B"         "9.0"      "Salut"         
#      5 "C"         "6"        "Hello"