Chapter 7 Extract content

7.1 Import Word document

The function docx_summary() returns the content of a Word document.

library(officer)
example_docx <- system.file(package = "officer", "doc_examples/example.docx")
doc <- read_docx(example_docx)
content <- docx_summary(doc)
content

doc_index

content_type

style_name

text

table_index

row_id

cell_id

is_header

row_span

col_span

table_stylename

integer

character

character

character

integer

integer

integer

logical

integer

character

character

1

paragraph

heading 1

Title 1

2

paragraph

Lorem ipsum dolor sit amet, consectetur adipiscing elit.

3

paragraph

heading 1

Title 2

4

paragraph

List Paragraph

Quisque tristique

5

paragraph

List Paragraph

Augue nisi, et convallis

6

paragraph

List Paragraph

Sapien mollis nec.

7

paragraph

heading 2

Sub title 1

8

paragraph

List Paragraph

Quisque tristique

9

paragraph

List Paragraph

Augue nisi, et convallis

10

paragraph

List Paragraph

Sapien mollis nec.

12

paragraph

Phasellus nec nunc vitae nulla interdum volutpat eu ac massa.

13

paragraph

heading 2

Sub title 2

14

paragraph

Morbi rhoncus sapien sit amet leo eleifend, vel fermentum nisi mattis.

16

table cell

Petals

1

1

1

true

1

1

Light Shading

17

table cell

Internode

1

1

2

true

1

1

Light Shading

18

table cell

Sepal

1

1

3

true

1

1

Light Shading

19

table cell

Bract

1

1

4

true

1

1

Light Shading

20

table cell

5,621498349

1

2

1

false

1

2

Light Shading

21

table cell

2,462106579

1

2

3

false

1

2

Light Shading

22

table cell

18,2034091

1

2

4

false

1

2

Light Shading

23

table cell

4,994616997

1

3

1

false

1

1

Light Shading

24

table cell

AA

1

3

2

false

2

1

Light Shading

25

table cell

2,429320759

1

3

3

false

1

1

Light Shading

26

table cell

17,65204912

1

3

4

false

1

1

Light Shading

27

table cell

4,767504884

1

4

1

false

1

1

Light Shading

29

table cell

AAA

1

4

3

false

1

2

Light Shading

30

table cell

25,9242382

1

5

1

false

1

2

Light Shading

31

table cell

2,066051345

1

5

3

false

1

1

Light Shading

32

table cell

18,37915478

1

5

4

false

1

1

Light Shading

33

table cell

6,489375001

1

6

1

false

1

1

Light Shading

34

table cell

25,21130805

1

6

2

false

1

1

Light Shading

35

table cell

2,901582763

1

6

3

false

1

1

Light Shading

36

table cell

17,31304737

1

6

4

false

1

1

Light Shading

37

table cell

17,07215724

1

6

4

false

1

1

Light Shading

38

table cell

18,2902189

1

6

4

false

3

1

Light Shading

39

table cell

5,7858682

1

7

1

false

1

1

Light Shading

40

table cell

25,52433147

1

7

2

false

1

1

Light Shading

41

table cell

2,655642742

1

7

3

false

1

1

Light Shading

43

table cell

5,645575295

1

8

1

false

1

1

Light Shading

44

table cell

Merged cell

1

8

2

false

4

1

Light Shading

45

table cell

2,278691288

1

8

3

false

1

1

Light Shading

47

table cell

4,828953215

1

9

1

false

1

1

Light Shading

49

table cell

2,238467716

1

9

3

false

1

1

Light Shading

50

table cell

19,87376227

1

9

4

false

1

1

Light Shading

51

table cell

6,783500773

1

10

1

false

1

1

Light Shading

53

table cell

2,202762147

1

10

3

false

1

1

Light Shading

54

table cell

19,85326662

1

10

4

false

1

1

Light Shading

55

table cell

5,395076839

1

11

1

false

1

1

Light Shading

57

table cell

2,538375992

1

11

3

false

1

1

Light Shading

58

table cell

19,56545356

1

11

4

false

1

1

Light Shading

59

table cell

4,683617783

1

12

1

false

1

1

Light Shading

60

table cell

29,2459239

1

12

2

false

1

1

Light Shading

61

table cell

2,601945544

1

12

3

false

1

1

Light Shading

62

table cell

18,95335451

1

12

4

false

1

1

Light Shading

63

table cell

Note

1

13

1

false

1

4

Light Shading

64

table cell

New line note

1

13

4

false

1

4

Light Shading

n: 56

Explore the results:

count(content, content_type)

content_type

n

character

integer

paragraph

13

table cell

43

n: 2

To get all paragraphs:

filter(content, content_type %in% "paragraph") |>
  select(doc_index, style_name, text)

doc_index

style_name

text

integer

character

character

1

heading 1

Title 1

2

Lorem ipsum dolor sit amet, consectetur adipiscing elit.

3

heading 1

Title 2

4

List Paragraph

Quisque tristique

5

List Paragraph

Augue nisi, et convallis

6

List Paragraph

Sapien mollis nec.

7

heading 2

Sub title 1

8

List Paragraph

Quisque tristique

9

List Paragraph

Augue nisi, et convallis

10

List Paragraph

Sapien mollis nec.

12

Phasellus nec nunc vitae nulla interdum volutpat eu ac massa.

13

heading 2

Sub title 2

14

Morbi rhoncus sapien sit amet leo eleifend, vel fermentum nisi mattis.

n: 13

7.1.1 Run-level extraction

By default, docx_summary() aggregates content at the paragraph (or table cell) level: one row per paragraph, one column text with the concatenated content. Passing detailed = TRUE switches the function to a run-level data.frame: one row per run, with additional columns exposing text formatting, hyperlinks, bookmarks, footnotes, field codes and embedded image paths.

Both dplyr and tidyr must be available when detailed = TRUE.

To illustrate features that are not present in the bundled example.docx, the chunk below reads a companion document that contains a bookmark, an external hyperlink, an internal link pointing to that bookmark, and a footnote.

doc_detailed <- read_docx(path = "static/office/example_detailed.docx")
runs <- docx_summary(doc_detailed, detailed = TRUE)
runs

doc_index

content_type

run_index

run_content_index

run_content_text

image_path

field_code

footnote_text

link

link_to_bookmark

bookmark_start

character_stylename

sz

sz_cs

font_family_ascii

font_family_eastasia

font_family_hansi

font_family_cs

bold

italic

underline

color

shading

shading_color

shading_fill

paragraph_stylename

keep_with_next

align

level

num_id

table_index

row_id

cell_id

col_span

row_span

is_header

table_stylename

integer

character

integer

integer

character

character

character

character

character

character

character

character

integer

integer

character

character

character

character

logical

logical

logical

character

character

character

character

character

logical

character

integer

integer

integer

integer

integer

character

integer

logical

character

1

paragraph

1

1

See

false

false

false

Normal

false

1

paragraph

2

1

the officer website

https://ardata-fr.github.io/officeverse/

false

false

false

Normal

false

1

paragraph

3

1

or jump to the

false

false

false

Normal

false

1

paragraph

4

1

anchor below

anchor

false

false

false

Normal

false

1

paragraph

5

1

for more details.

false

false

false

Normal

false

2

paragraph

1

1

A paragraph with a

false

false

false

Normal

false

2

paragraph

2

1

This is a footnote.

20

20

Arial

Arial

Arial

Arial

false

false

false

#000000

Normal

false

2

paragraph

3

1

footnote reference.

false

false

false

Normal

false

3

paragraph

1

1

anchor

false

false

false

Normal

false

4

paragraph

1

1

Target of the internal link.

false

false

false

Normal

false

n: 10

External hyperlinks surface in the link column, internal links in link_to_bookmark:

runs |>
  filter(!is.na(link) | !is.na(link_to_bookmark)) |>
  select(doc_index, run_content_text, link, link_to_bookmark)

doc_index

run_content_text

link

link_to_bookmark

integer

character

character

character

1

the officer website

https://ardata-fr.github.io/officeverse/

1

anchor below

anchor

n: 2

Bookmarks starting on a paragraph are concatenated (with |) in bookmark_start:

runs |>
  filter(!is.na(bookmark_start)) |>
  select(doc_index, bookmark_start, run_content_text)

doc_index
integer

3

bookmark_start
character

anchor

run_content_text
character

Footnote contents are carried in footnote_text:

runs |>
  filter(!is.na(footnote_text)) |>
  select(doc_index, run_content_text, footnote_text)

doc_index
integer

2

run_content_text
character

footnote_text
character

This is a footnote.

Run-level formatting (bold, italic, underline, color, character style, font size in half-points, font families, paragraph alignment, numbering level) is exposed in dedicated columns. For example, on the bundled example.docx:

runs_ex <- docx_summary(doc, detailed = TRUE)
runs_ex |>
  filter(bold %in% TRUE | italic %in% TRUE) |>
  select(doc_index, run_content_text, bold, italic, color, sz) 

doc_index

run_content_text

bold

italic

color

sz

integer

character

logical

logical

character

integer

n: 0

Embedded images are extracted to the temporary directory of the rdocx object and their paths are available in the image_path column. If the run refers to an image, the other text-oriented columns are NA. If you need the files afterwards, copy them to a permanent location before the R session ends.

7.1.2 Word tables

Tables are unstacked:

table_cells <- subset(content, content_type %in% "table cell")
table_cells

doc_index

content_type

style_name

text

table_index

row_id

cell_id

is_header

row_span

col_span

table_stylename

integer

character

character

character

integer

integer

integer

logical

integer

character

character

16

table cell

Petals

1

1

1

true

1

1

Light Shading

17

table cell

Internode

1

1

2

true

1

1

Light Shading

18

table cell

Sepal

1

1

3

true

1

1

Light Shading

19

table cell

Bract

1

1

4

true

1

1

Light Shading

20

table cell

5,621498349

1

2

1

false

1

2

Light Shading

21

table cell

2,462106579

1

2

3

false

1

2

Light Shading

22

table cell

18,2034091

1

2

4

false

1

2

Light Shading

23

table cell

4,994616997

1

3

1

false

1

1

Light Shading

24

table cell

AA

1

3

2

false

2

1

Light Shading

25

table cell

2,429320759

1

3

3

false

1

1

Light Shading

26

table cell

17,65204912

1

3

4

false

1

1

Light Shading

27

table cell

4,767504884

1

4

1

false

1

1

Light Shading

29

table cell

AAA

1

4

3

false

1

2

Light Shading

30

table cell

25,9242382

1

5

1

false

1

2

Light Shading

31

table cell

2,066051345

1

5

3

false

1

1

Light Shading

32

table cell

18,37915478

1

5

4

false

1

1

Light Shading

33

table cell

6,489375001

1

6

1

false

1

1

Light Shading

34

table cell

25,21130805

1

6

2

false

1

1

Light Shading

35

table cell

2,901582763

1

6

3

false

1

1

Light Shading

36

table cell

17,31304737

1

6

4

false

1

1

Light Shading

37

table cell

17,07215724

1

6

4

false

1

1

Light Shading

38

table cell

18,2902189

1

6

4

false

3

1

Light Shading

39

table cell

5,7858682

1

7

1

false

1

1

Light Shading

40

table cell

25,52433147

1

7

2

false

1

1

Light Shading

41

table cell

2,655642742

1

7

3

false

1

1

Light Shading

43

table cell

5,645575295

1

8

1

false

1

1

Light Shading

44

table cell

Merged cell

1

8

2

false

4

1

Light Shading

45

table cell

2,278691288

1

8

3

false

1

1

Light Shading

47

table cell

4,828953215

1

9

1

false

1

1

Light Shading

49

table cell

2,238467716

1

9

3

false

1

1

Light Shading

50

table cell

19,87376227

1

9

4

false

1

1

Light Shading

51

table cell

6,783500773

1

10

1

false

1

1

Light Shading

53

table cell

2,202762147

1

10

3

false

1

1

Light Shading

54

table cell

19,85326662

1

10

4

false

1

1

Light Shading

55

table cell

5,395076839

1

11

1

false

1

1

Light Shading

57

table cell

2,538375992

1

11

3

false

1

1

Light Shading

58

table cell

19,56545356

1

11

4

false

1

1

Light Shading

59

table cell

4,683617783

1

12

1

false

1

1

Light Shading

60

table cell

29,2459239

1

12

2

false

1

1

Light Shading

61

table cell

2,601945544

1

12

3

false

1

1

Light Shading

62

table cell

18,95335451

1

12

4

false

1

1

Light Shading

63

table cell

Note

1

13

1

false

1

4

Light Shading

64

table cell

New line note

1

13

4

false

1

4

Light Shading

n: 43

Cell positions and values are stored in the columns row_id, cell_id, text and is_header (a logical column indicating whether the cell is part of a header). Note that the content itself (column text) is a character vector.

table_body <- subset(table_cells, !is_header)
table_body <- table_body[,c("row_id", "cell_id", "text")]
table_body

row_id

cell_id

text

integer

integer

character

2

1

5,621498349

2

3

2,462106579

2

4

18,2034091

3

1

4,994616997

3

2

AA

3

3

2,429320759

3

4

17,65204912

4

1

4,767504884

4

3

AAA

5

1

25,9242382

5

3

2,066051345

5

4

18,37915478

6

1

6,489375001

6

2

25,21130805

6

3

2,901582763

6

4

17,31304737

6

4

17,07215724

6

4

18,2902189

7

1

5,7858682

7

2

25,52433147

7

3

2,655642742

8

1

5,645575295

8

2

Merged cell

8

3

2,278691288

9

1

4,828953215

9

3

2,238467716

9

4

19,87376227

10

1

6,783500773

10

3

2,202762147

10

4

19,85326662

11

1

5,395076839

11

3

2,538375992

11

4

19,56545356

12

1

4,683617783

12

2

29,2459239

12

3

2,601945544

12

4

18,95335451

13

1

Note

13

4

New line note

n: 39

Reshaping the data with columns row_id, cell_id and text would display something close to the original table:

tapply(table_body$text, 
       list(row_id = table_body$row_id, 
            cell_id = table_body$cell_id
            ), 
       FUN = I
       )
#       cell_id
# row_id 1           2           3           4            
#     2  5,621498349 NULL        2,462106579 18,2034091   
#     3  4,994616997 AA          2,429320759 17,65204912  
#     4  4,767504884 NULL        AAA         NULL         
#     5  25,9242382  NULL        2,066051345 18,37915478  
#     6  6,489375001 25,21130805 2,901582763 AsIs,3       
#     7  5,7858682   25,52433147 2,655642742 NULL         
#     8  5,645575295 Merged cell 2,278691288 NULL         
#     9  4,828953215 NULL        2,238467716 19,87376227  
#     10 6,783500773 NULL        2,202762147 19,85326662  
#     11 5,395076839 NULL        2,538375992 19,56545356  
#     12 4,683617783 29,2459239  2,601945544 18,95335451  
#     13 Note        NULL        NULL        New line note

Getting headers requires another operation:

data <- subset(table_cells, is_header) 
data <- data[, c("row_id", "cell_id", "text") ] 

tapply(data$text, 
   list(row_id = data$row_id, 
        cell_id = data$cell_id
        ), FUN = I )
#       cell_id
# row_id 1        2           3       4      
#      1 "Petals" "Internode" "Sepal" "Bract"

7.2 Import PowerPoint document

The function pptx_summary() returns the content of a PowerPoint document.

example_pptx <- system.file(package = "officer", "doc_examples/example.pptx")
file.copy(example_pptx, to = "static/office/example.pptx")
# [1] TRUE

doc <- read_pptx("static/office/example.pptx")
content <- pptx_summary(doc)
head(content)

text

id

content_type

slide_id

row_id

cell_id

col_span

row_span

media_file

character

character

character

integer

integer

integer

integer

integer

character

Title

12

paragraph

1

A table

13

paragraph

1

and some text

13

paragraph

1

and some list (1)

13

paragraph

1

and some list (2)

13

paragraph

1

Header 1

18

table cell

1

1

1

1

1

n: 6

Explore the results:

tapply(content$id, 
       content$content_type, 
       function(x) length(unique(x)))
#      image  paragraph table cell 
#          1          5          2

To get all paragraphs:

par_data <- subset(content, 
                   content_type %in% "paragraph", 
                   select = c(id, text) )
head(par_data)

id

text

character

character

12

Title

13

A table

13

and some text

13

and some list (1)

13

and some list (2)

15

R logo

n: 6

To get an image:

image_row <- subset(content, content_type %in% "image")
img  <- file.path(tempdir(), "extract.png")
media_extract(doc, path = image_row$media_file, target = img)
# [1] TRUE

7.2.1 PowerPoint tables

Tables are unstacked:

table_cells <- subset(content, content_type %in% "table cell")
head(table_cells)

text

id

content_type

slide_id

row_id

cell_id

col_span

row_span

media_file

character

character

character

integer

integer

integer

integer

integer

character

Header 1

18

table cell

1

1

1

1

1

A

18

table cell

1

2

1

1

1

B

18

table cell

1

3

1

1

1

B

18

table cell

1

4

1

1

1

C

18

table cell

1

5

1

1

1

Header 2

18

table cell

1

1

2

1

1

n: 6

Cell positions and values are stored in the columns row_id, cell_id and text. Note that here there is no indicator for the table header.

data <- subset(table_cells, id == 18, c(row_id, cell_id, text) )
tapply(data$text, 
   list(row_id = data$row_id, 
        cell_id = data$cell_id
        ), FUN = I )
#       cell_id
# row_id 1           2          3               
#      1 "Header 1 " "Header 2" "Header 3"      
#      2 "A"         "12.23"    "blah blah"     
#      3 "B"         "1.23"     "blah blah blah"
#      4 "B"         "9.0"      "Salut"         
#      5 "C"         "6"        "Hello"