Read in .docx Content

Usage

read_docx(file, skip = 0)

Arguments

file
The path to the .docx file.
skip
The number of lines to skip.

Value

Returns a character vector.

Description

Read in the content from a .docx file.

Examples

## Not run:
# ## Mining Citation
# url_dl("http://umlreading.weebly.com/uploads/2/5/2/5/25253346/whole_language_timeline-updated.docx")
#
# (txt <- read_docx("whole_language_timeline-updated.docx"))
#
# library(qdapTools); library(ggplot2); library(qdap)
# txt <- rm_non_ascii(txt)
#
# parts <- split_vector(txt, split = "References", include = TRUE, regex=TRUE)
#
# parts[[1]]
#
# rm_citation(unbag(parts[[1]]), extract=TRUE)[[1]]
#
# ## By line
# rm_citation(parts[[1]], extract=TRUE)
#
# ## Frequency
# left_just(cites <- list2df(sort(table(rm_citation(unbag(parts[[1]]),
#     extract=TRUE)), T), "freq", "citation")[2:1])
#
# ## Distribution of citations (find locations and then plot)
# cite_locs <- do.call(rbind, lapply(cites[[1]], function(x){
#     m <- gregexpr(x, unbag(parts[[1]]), fixed=TRUE)
#     data.frame(
#         citation=x,
#         start = m[[1]] -5,
#         end = m[[1]] + 5 + attributes(m[[1]])[["match.length"]]
#     )
# }))
#
# ggplot(cite_locs) +
#     geom_segment(aes(x=start, xend=end, y=citation, yend=citation), size=3,
#         color="yellow") +
#     xlab("Duration") +
#     scale_x_continuous(expand = c(0,0),
#         limits = c(0, nchar(unbag(parts[[1]])) + 25)) +
#     theme_grey() +
#     theme(
#         panel.grid.major=element_line(color="grey20"),
#         panel.grid.minor=element_line(color="grey20"),
#         plot.background = element_rect(fill="black"),
#         panel.background = element_rect(fill="black"),
#         panel.border = element_rect(colour = "grey50", fill=NA, size=1),
#         axis.text=element_text(color="grey50"),
#         axis.title=element_text(color="grey50")
#     )
#
## End(Not run)

Author

Bryan Goodrich and Tyler Rinker.