Chapter 7 Extract content
7.1 Import Word document
The function `docx_summary()` returns the content of a Word document.
library(officer)

# Locate the example Word document shipped with the officer package.
example_docx <- system.file(package = "officer", "doc_examples/example.docx")

# Read the document, then summarise its content.
doc <- read_docx(example_docx)
content <- docx_summary(doc)

# Show the first rows of the summary.
head(content)
doc_index | content_type | style_name | text | level | num_id | row_id | is_header | cell_id | col_span | row_span |
---|---|---|---|---|---|---|---|---|---|---|
integer | character | character | character | numeric | integer | integer | logical | numeric | numeric | integer |
1 | paragraph | heading 1 | Title 1 | |||||||
2 | paragraph | Lorem ipsum dolor sit amet, consectetur adipiscing elit. | ||||||||
3 | paragraph | heading 1 | Title 2 | |||||||
4 | paragraph | List Paragraph | Quisque tristique | 1 | 2 | |||||
5 | paragraph | List Paragraph | Augue nisi, et convallis | 1 | 2 | |||||
6 | paragraph | List Paragraph | Sapien mollis nec. | 1 | 2 | |||||
n: 6 |
Explore the results:
# Count the distinct doc_index values within each content_type.
tapply(content$doc_index,
       content$content_type,
       function(x) length(unique(x)))
# paragraph table cell
# 17 1
To get all paragraphs:
# Keep only the rows whose content_type is "paragraph".
par_data <- subset(content, content_type %in% "paragraph")

# Keep the columns that are displayed below.
par_data <- par_data[, c("doc_index", "style_name",
                         "text", "level", "num_id")]

# Truncate the text to at most 30 characters for display.
par_data$text <- with(par_data, {
  substr(
    text,
    start = 1,
    stop = ifelse(nchar(text) < 30, nchar(text), 30)
  )
})
par_data
doc_index | style_name | text | level | num_id |
---|---|---|---|---|
integer | character | character | numeric | integer |
1 | heading 1 | Title 1 | ||
2 | Lorem ipsum dolor sit amet, co | |||
3 | heading 1 | Title 2 | ||
4 | List Paragraph | Quisque tristique | 1 | 2 |
5 | List Paragraph | Augue nisi, et convallis | 1 | 2 |
6 | List Paragraph | Sapien mollis nec. | 1 | 2 |
7 | heading 2 | Sub title 1 | ||
8 | List Paragraph | Quisque tristique | 1 | 1 |
9 | List Paragraph | Augue nisi, et convallis | 1 | 1 |
10 | List Paragraph | Sapien mollis nec. | 1 | 1 |
11 | ||||
12 | Phasellus nec nunc vitae nulla | |||
13 | heading 2 | Sub title 2 | ||
14 | Morbi rhoncus sapien sit amet | |||
15 | ||||
17 | ||||
18 | ||||
n: 17 |
There is no support to extract images stored in Word paragraphs.
7.1.1 Word tables
Tables are unstacked:
# Keep only the rows whose content_type is "table cell".
table_cells <- subset(content, content_type %in% "table cell")
print(head(table_cells))
# doc_index content_type style_name text level num_id row_id
# 1.1 16 table cell Light Shading Petals NA NA 1
# 1.11 16 table cell Light Shading 5,621498349 NA NA 2
# 1.12 16 table cell Light Shading 4,994616997 NA NA 3
# 1.13 16 table cell Light Shading 4,767504884 NA NA 4
# 1.14 16 table cell Light Shading 25,9242382 NA NA 5
# 1.15 16 table cell Light Shading 6,489375001 NA NA 6
# is_header cell_id col_span row_span
# 1.1 TRUE 1 1 1
# 1.11 FALSE 1 2 1
# 1.12 FALSE 1 1 1
# 1.13 FALSE 1 1 1
# 1.14 FALSE 1 2 1
# 1.15 FALSE 1 1 1
Cell positions and values are dispatched in the columns `row_id`, `cell_id`, `text` and `is_header` (a logical column indicating whether the cell is part of a header or not). Note that the content itself (column `text`) is a character vector.
# Drop the header cells, then keep the position and text columns.
table_body <- subset(table_cells, !is_header)
table_body <- table_body[, c("row_id", "cell_id", "text")]
head(table_body)
row_id | cell_id | text |
---|---|---|
integer | numeric | character |
2 | 1 | 5,621498349 |
3 | 1 | 4,994616997 |
4 | 1 | 4,767504884 |
5 | 1 | 25,9242382 |
6 | 1 | 6,489375001 |
7 | 1 | 5,7858682 |
n: 6 |
Reshaping the data with the columns `row_id`, `cell_id` and `text` displays something close to the original table:
# Cross-tabulate the cell text by row_id and cell_id; I() keeps each
# cell's text unchanged.
with(
  table_body,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)
# cell_id
# row_id 1 2 3
# 2 "5,621498349" NA "2,46210657918,2034091"
# 3 "4,994616997" "AA" "2,429320759"
# 4 "4,767504884" NA "AAA"
# 5 "25,9242382" NA "2,066051345"
# 6 "6,489375001" "25,21130805" "2,901582763"
# 7 "5,7858682" "25,52433147" "2,655642742"
# 8 "5,645575295" "Merged cell" "2,278691288"
# 9 "4,828953215" NA "2,238467716"
# 10 "6,783500773" NA "2,202762147"
# 11 "5,395076839" NA "2,538375992"
# 12 "4,683617783" "29,2459239" "2,601945544"
# 13 "NoteNew line note" NA NA
# cell_id
# row_id 4
# 2 NA
# 3 "17,65204912"
# 4 NA
# 5 "18,37915478"
# 6 "17,3130473717,0721572418,2902189"
# 7 NA
# 8 NA
# 9 "19,87376227"
# 10 "19,85326662"
# 11 "19,56545356"
# 12 "18,95335451"
# 13 NA
Getting headers requires another operation:
# Keep only the header cells, then the position and text columns.
data <- subset(table_cells, is_header)
data <- data[, c("row_id", "cell_id", "text")]

# Cross-tabulate the header text by row_id and cell_id.
tapply(data$text,
       list(row_id = data$row_id,
            cell_id = data$cell_id),
       FUN = I)
# cell_id
# row_id 1 2 3 4
# 1 "Petals" "Internode" "Sepal" "Bract"
7.2 Import PowerPoint document
The function `pptx_summary()` returns the content of a PowerPoint document.
# Locate the example PowerPoint document shipped with the officer package
# and copy it to the reports directory.
example_pptx <- system.file(package = "officer", "doc_examples/example.pptx")
file.copy(example_pptx, to = "static/reports/example.pptx")
# [1] FALSE
# Read the copied presentation, then summarise its content.
doc <- read_pptx("static/reports/example.pptx")
content <- pptx_summary(doc)
head(content)
text | id | content_type | slide_id | row_id | cell_id | col_span | row_span | media_file |
---|---|---|---|---|---|---|---|---|
character | character | character | integer | integer | integer | integer | integer | character |
Title | 12 | paragraph | 1 | |||||
A table | 13 | paragraph | 1 | |||||
and some text | 13 | paragraph | 1 | |||||
and some list (1) | 13 | paragraph | 1 | |||||
and some list (2) | 13 | paragraph | 1 | |||||
Header 1 | 18 | table cell | 1 | 1 | 1 | 1 | 1 | |
n: 6 |
Explore the results:
# Count the distinct id values within each content_type.
tapply(content$id,
       content$content_type,
       function(x) length(unique(x)))
# image paragraph table cell
# 1 5 2
To get all paragraphs:
# Keep the id and text columns of the rows whose content_type is "paragraph".
par_data <- subset(content,
                   content_type %in% "paragraph",
                   select = c(id, text))
head(par_data)
id | text |
---|---|
character | character |
12 | Title |
13 | A table |
13 | and some text |
13 | and some list (1) |
13 | and some list (2) |
15 | R logo |
n: 6 |
To get an image:
# Select the rows whose content_type is "image".
image_row <- subset(content, content_type %in% "image")

# Extract the media file referenced by media_file to a PNG path in the
# session's temporary directory.
img <- file.path(tempdir(), "extract.png")
media_extract(doc, path = image_row$media_file, target = img)
# [1] TRUE
7.2.1 PowerPoint tables
Tables are unstacked:
# Keep only the rows whose content_type is "table cell".
table_cells <- subset(content, content_type %in% "table cell")
head(table_cells)
text | id | content_type | slide_id | row_id | cell_id | col_span | row_span | media_file |
---|---|---|---|---|---|---|---|---|
character | character | character | integer | integer | integer | integer | integer | character |
Header 1 | 18 | table cell | 1 | 1 | 1 | 1 | 1 | |
A | 18 | table cell | 1 | 2 | 1 | 1 | 1 | |
B | 18 | table cell | 1 | 3 | 1 | 1 | 1 | |
B | 18 | table cell | 1 | 4 | 1 | 1 | 1 | |
C | 18 | table cell | 1 | 5 | 1 | 1 | 1 | |
Header 2 | 18 | table cell | 1 | 1 | 2 | 1 | 1 | |
n: 6 |
Cell positions and values are dispatched in the columns `row_id`, `cell_id` and `text`.
Note that here there is no indicator for the table header.
# Keep the position and text columns of the cells whose id is 18.
data <- subset(table_cells, id == 18, select = c(row_id, cell_id, text))

# Cross-tabulate the cell text by row_id and cell_id.
tapply(data$text,
       list(row_id = data$row_id,
            cell_id = data$cell_id),
       FUN = I)
# cell_id
# row_id 1 2 3
# 1 "Header 1 " "Header 2" "Header 3"
# 2 "A" "12.23" "blah blah"
# 3 "B" "1.23" "blah blah blah"
# 4 "B" "9.0" "Salut"
# 5 "C" "6" "Hello"