Chapter 7 Extract content
7.1 Import Word document
The function `docx_summary()` returns the content of a Word document as a data frame.
library(officer)

# Path to the example Word document shipped with the officer package.
example_docx <- system.file(package = "officer", "doc_examples/example.docx")

# Read the document, then summarise its content.
doc <- read_docx(example_docx)
content <- docx_summary(doc)
head(content)
Explore the results:
# Count the distinct doc_index values found for each content type.
tapply(
  content$doc_index,
  content$content_type,
  function(x) length(unique(x))
)
# paragraph table cell
# 17 1
To get all paragraphs:
# Keep only the rows describing paragraphs.
par_data <- subset(content, content_type %in% "paragraph")
par_data <- par_data[, c("doc_index", "style_name",
                         "text", "level", "num_id")]

# Truncate the text to at most 30 characters so the printed table stays
# readable.
par_data$text <- with(par_data, {
  substr(
    text,
    start = 1,
    stop = ifelse(nchar(text) < 30, nchar(text), 30)
  )
})
par_data
There is no support to extract images stored in Word paragraphs.
7.1.1 Word tables
Tables are unstacked:
# Keep only the rows describing table cells.
table_cells <- subset(content, content_type %in% "table cell")
print(head(table_cells))
# doc_index content_type style_name text level num_id row_id
# 1.1 16 table cell Light Shading Petals NA NA 1
# 1.11 16 table cell Light Shading 5,621498349 NA NA 2
# 1.12 16 table cell Light Shading 4,994616997 NA NA 3
# 1.13 16 table cell Light Shading 4,767504884 NA NA 4
# 1.14 16 table cell Light Shading 25,9242382 NA NA 5
# 1.15 16 table cell Light Shading 6,489375001 NA NA 6
# is_header cell_id col_span row_span
# 1.1 TRUE 1 1 1
# 1.11 FALSE 1 2 1
# 1.12 FALSE 1 1 1
# 1.13 FALSE 1 1 1
# 1.14 FALSE 1 2 1
# 1.15 FALSE 1 1 1
Cell positions and values are stored in the columns `row_id`, `cell_id`, `text` and `is_header` (a logical column indicating whether the cell is part of a header). Note that the content itself (column `text`) is a character vector.
# Drop header cells and keep only the position and text columns.
table_body <- subset(table_cells, !is_header)
table_body <- table_body[, c("row_id", "cell_id", "text")]
head(table_body)
Reshaping the data with the columns `row_id`, `cell_id` and `text` displays something close to the original table:
# Cross-tabulate the cell text by row and column position; I() keeps each
# cell value unchanged.
with(
  table_body,
  tapply(text, list(row_id = row_id, cell_id = cell_id), FUN = I)
)
# cell_id
# row_id 1 2 3
# 2 "5,621498349" NA "2,46210657918,2034091"
# 3 "4,994616997" "AA" "2,429320759"
# 4 "4,767504884" NA "AAA"
# 5 "25,9242382" NA "2,066051345"
# 6 "6,489375001" "25,21130805" "2,901582763"
# 7 "5,7858682" "25,52433147" "2,655642742"
# 8 "5,645575295" "Merged cell" "2,278691288"
# 9 "4,828953215" NA "2,238467716"
# 10 "6,783500773" NA "2,202762147"
# 11 "5,395076839" NA "2,538375992"
# 12 "4,683617783" "29,2459239" "2,601945544"
# 13 "NoteNew line note" NA NA
# cell_id
# row_id 4
# 2 NA
# 3 "17,65204912"
# 4 NA
# 5 "18,37915478"
# 6 "17,3130473717,0721572418,2902189"
# 7 NA
# 8 NA
# 9 "19,87376227"
# 10 "19,85326662"
# 11 "19,56545356"
# 12 "18,95335451"
# 13 NA
Getting headers requires another operation:
# Keep only header cells, with their position and text.
data <- subset(table_cells, is_header)
data <- data[, c("row_id", "cell_id", "text")]

# Cross-tabulate the header text by row and column position.
tapply(
  data$text,
  list(
    row_id = data$row_id,
    cell_id = data$cell_id
  ),
  FUN = I
)
# cell_id
# row_id 1 2 3 4
# 1 "Petals" "Internode" "Sepal" "Bract"
7.2 Import PowerPoint document
The function `pptx_summary()` returns the content of a PowerPoint document.
# Path to the example PowerPoint document shipped with the officer package.
example_pptx <- system.file(package = "officer", "doc_examples/example.pptx")

# Copy it next to the reports. file.copy() returns FALSE when nothing was
# copied, e.g. when the destination already exists (overwrite defaults to
# FALSE) or the target directory is missing.
file.copy(example_pptx, to = "static/reports/example.pptx")
# [1] FALSE

# Read the presentation, then summarise its content.
doc <- read_pptx("static/reports/example.pptx")
content <- pptx_summary(doc)
head(content)
Explore the results:
# Count the distinct id values found for each content type.
tapply(
  content$id,
  content$content_type,
  function(x) length(unique(x))
)
# image paragraph table cell
# 1 5 2
To get all paragraphs:
# Keep only the paragraph rows, with their id and text columns.
par_data <- subset(
  content,
  content_type %in% "paragraph",
  select = c(id, text)
)
head(par_data)
To get an image:
# Locate the image row, then extract the media file to a temporary PNG.
image_row <- subset(content, content_type %in% "image")
img <- file.path(tempdir(), "extract.png")
media_extract(doc, path = image_row$media_file, target = img)
# [1] TRUE
7.2.1 PowerPoint tables
Tables are unstacked:
# Keep only the rows describing table cells.
table_cells <- subset(content, content_type %in% "table cell")
head(table_cells)
Cell positions and values are stored in the columns `row_id`, `cell_id` and `text`.
Note that here there is no indicator for the table header.
# Select the cells of the shape with id 18, keeping position and text.
data <- subset(table_cells, id == 18, c(row_id, cell_id, text))

# Cross-tabulate the cell text by row and column position.
tapply(
  data$text,
  list(
    row_id = data$row_id,
    cell_id = data$cell_id
  ),
  FUN = I
)
# cell_id
# row_id 1 2 3
# 1 "Header 1 " "Header 2" "Header 3"
# 2 "A" "12.23" "blah blah"
# 3 "B" "1.23" "blah blah blah"
# 4 "B" "9.0" "Salut"
# 5 "C" "6" "Hello"