Chapter 7 Extract content

7.1 Import Word document

The function docx_summary() returns the content of a Word document.

library(officer)
# Locate the example Word document that ships with the officer package.
example_docx <- system.file(
  "doc_examples", "example.docx",
  package = "officer"
)
# Read the document, then flatten its content into a data frame.
doc <- read_docx(path = example_docx)
content <- docx_summary(doc)
head(content)

doc_index	content_type	style_name	text	level	num_id	row_id	is_header	cell_id	col_span	row_span
integer	character	character	character	numeric	integer	integer	logical	numeric	numeric	integer
1	paragraph	heading 1	Title 1
2	paragraph		Lorem ipsum dolor sit amet, consectetur adipiscing elit.
3	paragraph	heading 1	Title 2
4	paragraph	List Paragraph	Quisque tristique	1	2
5	paragraph	List Paragraph	Augue nisi, et convallis	1	2
6	paragraph	List Paragraph	Sapien mollis nec.	1	2
n: 6

Explore the results:

# Count the distinct document elements found for each content type.
with(
  content,
  tapply(doc_index, content_type, function(ids) length(unique(ids)))
)

#  paragraph table cell 
#         17          1

To get all paragraphs:

# Keep only the paragraph rows and the columns that describe them.
par_data <- subset(content, content_type %in% "paragraph")
par_data <- par_data[, c("doc_index", "style_name",
                         "text", "level", "num_id")]
# Truncate the text to 30 characters for display. substr() already stops at
# the end of shorter strings, so no per-row length check is needed.
par_data$text <- substr(par_data$text, start = 1, stop = 30)
par_data

doc_index	style_name	text	level	num_id
integer	character	character	numeric	integer
1	heading 1	Title 1
2		Lorem ipsum dolor sit amet, co
3	heading 1	Title 2
4	List Paragraph	Quisque tristique	1	2
5	List Paragraph	Augue nisi, et convallis	1	2
6	List Paragraph	Sapien mollis nec.	1	2
7	heading 2	Sub title 1
8	List Paragraph	Quisque tristique	1	1
9	List Paragraph	Augue nisi, et convallis	1	1
10	List Paragraph	Sapien mollis nec.	1	1
11
12		Phasellus nec nunc vitae nulla
13	heading 2	Sub title 2
14		Morbi rhoncus sapien sit amet
15
17
18
n: 17

There is no support to extract images stored in Word paragraphs.

7.1.1 Word tables

Tables are unstacked, i.e. each table cell is returned as its own row:

# Keep only the rows that describe table cells (one row per cell).
is_table_cell <- content$content_type %in% "table cell"
table_cells <- content[is_table_cell, ]
print(head(table_cells))

#      doc_index content_type    style_name        text level num_id row_id
# 1.1         16   table cell Light Shading      Petals    NA     NA      1
# 1.11        16   table cell Light Shading 5,621498349    NA     NA      2
# 1.12        16   table cell Light Shading 4,994616997    NA     NA      3
# 1.13        16   table cell Light Shading 4,767504884    NA     NA      4
# 1.14        16   table cell Light Shading  25,9242382    NA     NA      5
# 1.15        16   table cell Light Shading 6,489375001    NA     NA      6
#      is_header cell_id col_span row_span
# 1.1       TRUE       1        1        1
# 1.11     FALSE       1        2        1
# 1.12     FALSE       1        1        1
# 1.13     FALSE       1        1        1
# 1.14     FALSE       1        2        1
# 1.15     FALSE       1        1        1

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of a header). Note that the content itself (column text) is a character vector.

# Drop the header cells and keep only the position and text columns.
table_body <- subset(
  table_cells,
  !is_header,
  select = c(row_id, cell_id, text)
)
head(table_body)

row_id	cell_id	text
integer	numeric	character
2	1	5,621498349
3	1	4,994616997
4	1	4,767504884
5	1	25,9242382
6	1	6,489375001
7	1	5,7858682
n: 6

Reshaping the data with the columns row_id, cell_id and text displays something close to the original table:

# Spread the cell texts into a row_id x cell_id grid; I() keeps each value
# unchanged.
with(
  table_body,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)

#       cell_id
# row_id 1                   2             3                      
#     2  "5,621498349"       NA            "2,46210657918,2034091"
#     3  "4,994616997"       "AA"          "2,429320759"          
#     4  "4,767504884"       NA            "AAA"                  
#     5  "25,9242382"        NA            "2,066051345"          
#     6  "6,489375001"       "25,21130805" "2,901582763"          
#     7  "5,7858682"         "25,52433147" "2,655642742"          
#     8  "5,645575295"       "Merged cell" "2,278691288"          
#     9  "4,828953215"       NA            "2,238467716"          
#     10 "6,783500773"       NA            "2,202762147"          
#     11 "5,395076839"       NA            "2,538375992"          
#     12 "4,683617783"       "29,2459239"  "2,601945544"          
#     13 "NoteNew line note" NA            NA                     
#       cell_id
# row_id 4                                 
#     2  NA                                
#     3  "17,65204912"                     
#     4  NA                                
#     5  "18,37915478"                     
#     6  "17,3130473717,0721572418,2902189"
#     7  NA                                
#     8  NA                                
#     9  "19,87376227"                     
#     10 "19,85326662"                     
#     11 "19,56545356"                     
#     12 "18,95335451"                     
#     13 NA

Getting headers requires another operation:

# Keep only the header cells, with their position and text.
data <- subset(table_cells, is_header, select = c(row_id, cell_id, text))

# Lay the header texts out in a row_id x cell_id grid.
with(
  data,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)

#       cell_id
# row_id 1        2           3       4      
#      1 "Petals" "Internode" "Sepal" "Bract"

7.2 Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document.

# Locate the example PowerPoint document that ships with the officer package.
example_pptx <- system.file(package = "officer", "doc_examples/example.pptx")
# file.copy() returns FALSE without copying when the target directory is
# missing or the target file already exists, so create the directory first
# and allow overwriting.
dir.create("static/reports", recursive = TRUE, showWarnings = FALSE)
file.copy(example_pptx, to = "static/reports/example.pptx", overwrite = TRUE)

# [1] TRUE

static/reports/example.pptx

# Read the copied presentation, then flatten its content into a data frame.
doc <- read_pptx(path = "static/reports/example.pptx")
content <- pptx_summary(doc)
# Show the first rows of the summary.
head(content, n = 6L)

text	id	content_type	slide_id	row_id	cell_id	col_span	row_span	media_file
character	character	character	integer	integer	integer	integer	integer	character
Title	12	paragraph	1
A table	13	paragraph	1
and some text	13	paragraph	1
and some list (1)	13	paragraph	1
and some list (2)	13	paragraph	1
Header 1	18	table cell	1	1	1	1	1
n: 6

Explore the results:

# Count the distinct shape ids found for each content type.
with(
  content,
  tapply(id, content_type, function(ids) length(unique(ids)))
)

#      image  paragraph table cell 
#          1          5          2

To get all paragraphs:

# Keep the id and text of every paragraph row.
is_paragraph <- content$content_type %in% "paragraph"
par_data <- content[is_paragraph, c("id", "text")]
head(par_data)

id	text
character	character
12	Title
13	A table
13	and some text
13	and some list (1)
13	and some list (2)
15	R logo
n: 6

To get an image:

# Select the row(s) describing images; media_file holds the path of the image
# inside the presentation.
image_row <- content[content$content_type %in% "image", ]
# Extract the image to a file in the session's temporary directory.
img <- file.path(tempdir(), "extract.png")
media_extract(doc, path = image_row$media_file, target = img)

# [1] TRUE

7.2.1 PowerPoint tables

Tables are unstacked, i.e. each table cell is returned as its own row:

# Keep only the rows that describe table cells (one row per cell).
is_table_cell <- content$content_type %in% "table cell"
table_cells <- content[is_table_cell, ]
head(table_cells)

text	id	content_type	slide_id	row_id	cell_id	col_span	row_span	media_file
character	character	character	integer	integer	integer	integer	integer	character
Header 1	18	table cell	1	1	1	1	1
A	18	table cell	1	2	1	1	1
B	18	table cell	1	3	1	1	1
B	18	table cell	1	4	1	1	1
C	18	table cell	1	5	1	1	1
Header 2	18	table cell	1	1	2	1	1
n: 6

Cell positions and values are stored in the columns row_id, cell_id and text. Note that here there is no indicator for the table header.

# Keep the cells of the table whose shape id is "18". The id column is
# character, so compare against a string rather than relying on the implicit
# coercion of a numeric literal.
data <- subset(
  table_cells,
  id %in% "18",
  select = c(row_id, cell_id, text)
)
# Lay the cell texts out in a row_id x cell_id grid; I() keeps each value
# unchanged.
tapply(data$text,
       list(row_id = data$row_id,
            cell_id = data$cell_id),
       FUN = I)

#       cell_id
# row_id 1           2          3               
#      1 "Header 1 " "Header 2" "Header 3"      
#      2 "A"         "12.23"    "blah blah"     
#      3 "B"         "1.23"     "blah blah blah"
#      4 "B"         "9.0"      "Salut"         
#      5 "C"         "6"        "Hello"