wfm. qdap 2.2.0

Usage

wfm(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)
"wfm"(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)
"wfm"(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)
"wfm"(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)
wfdf(text.var, grouping.var = NULL, stopwords = NULL, margins = FALSE, output = "raw", digits = 2, char2space = "~~", ...)
wfm_expanded(text.var, grouping.var = NULL, ...)
wfm_combine(wf.obj, word.lists, matrix = TRUE)
"weight"(x, type = "prop", ...)
"weight"(x, type = "prop", ...)
as.wfm(x, ...)
"as.wfm"(x, ...)
"as.wfm"(x, ...)
"as.wfm"(x, ...)
"as.wfm"(x, ...)
"as.wfm"(x, ...)
"as.wfm"(x, ...)
"as.wfm"(x, col = "docs", row = "text", ...)
"wfm"(text.var, ...)

Arguments

text.var: The text variable.
grouping.var: The grouping variables. Default NULL generates one word list for all text. Also takes a single grouping variable or a list of 1 or more grouping variables.
output: Output type (one of "raw", "proportion" or "percent"). Default is "raw".
stopwords: A vector of stop words to remove.
char2space: A vector of characters to be turned into spaces. If char.keep is NULL, char2space will activate this argument.
margins: logical. If TRUE provides grouping.var and word variable totals.
digits: An integer indicating the number of decimal places (round) or significant digits (signif) to be used. Negative values are allowed.
wf.obj: A wfm or wfdf object.
word.lists: A list of character vectors of words to pass to wfm_combine
matrix: logical. If TRUE returns the output as a wfm rather than a wfdf object.
x: An object with words for row names and integer values.
type: The type of weighting to use: c("prop", "max", "scaled"). All weight by column. "prop" uses a proportion weighting and all columns sum to 1. "max" weights in proportion to the max value; all values are integers and column sums may not be equal. "scaled" uses scale to scale with center = FALSE; output is not integer and column sums may not be equal.
col: The column name (generally not used).
row: The row name (generally not used).
...: Other arguments supplied to Corpus or TermDocumentMatrix. For as.wfm, these are other arguments passed to as.wfm methods (currently ignored).

Word Frequency Matrix

Value

wfm - returns a word frequency matrix of the class matrix.

wfdf - returns a word frequency data frame of the class data.frame with a words column and optional margin sums.

wfm_expanded - returns a matrix similar to a word frequency matrix (wfm) but the rows are expanded to represent the maximum usages of the word and cells are dummy coded to indicate that number of uses.

wfm_combine - returns a word frequency matrix (wfm) or dataframe (wfdf) with counts for the combined word.lists merged and remaining terms (else).

weight - Returns a weighted matrix for use with other R packages. The output is not of the class "wfm".

as.wfm - Returns a matrix of the class "wfm".

Description

wfm - Generate a word frequency matrix by grouping variable(s).

wfm.wfdf - wfdf method for wfm.

wfm.character - character method for wfm.

wfm.factor - factor method for wfm.

wfdf - Generate a word frequency data frame by grouping variable.

wfm_expanded - Expand a word frequency matrix to have multiple rows for each word.

wfm_combine - Combines words (rows) of a word frequency matrix (wfm) or word frequency data frame (wfdf) together.

weight - Weight a word frequency matrix for analysis where such weighting is sensible.

weight.wfdf - Weight a word frequency matrix for analysis where such weighting is sensible.

as.wfm - Attempts to coerce a matrix to a wfm.

as.wfm.matrix - matrix method for as.wfm used to convert matrices to a wfm.

as.wfm.default - Default method for as.wfm used to convert matrices to a wfm.

as.wfm.TermDocumentMatrix - TermDocumentMatrix method for as.wfm used to convert a TermDocumentMatrix to a wfm.

as.wfm.DocumentTermMatrix - DocumentTermMatrix method for as.wfm used to convert a DocumentTermMatrix to a wfm.

as.wfm.data.frame - data.frame method for as.wfm used to convert a data.frame to a wfm.

as.wfm.wfdf - wfdf method for as.wfm used to convert a wfdf to a wfm.

as.wfm.Corpus - Corpus method for as.wfm used to convert a Corpus to a wfm.

wfm.Corpus - Corpus method for wfm.

Note

Words can be kept together as a single word/entry by inserting a double tilde ("~~"), or another character string passed to char2space, between them. This is useful for keeping proper names as a single unit.

Examples

## Not run: 
# ## word frequency matrix (wfm) example:
# with(DATA, wfm(state, list(sex, adult)))[1:15, ]
# with(DATA, wfm(state, person))[1:15, ]
# Filter(with(DATA, wfm(state, list(sex, adult))), 5)
# with(DATA, wfm(state, list(sex, adult)))
# 
# ## Filter particular words based on max/min values in wfm
# v <- with(DATA, wfm(state, list(sex, adult)))
# Filter(v, 5)
# Filter(v, 5, count.apostrophe = FALSE)
# Filter(v, 5, 7)
# Filter(v, 4, 4)
# Filter(v, 3, 4)
# Filter(v, 3, 4, stopwords = Top25Words)
# 
# ## insert double tilde ("~~") to keep phrases (e.g., first and last name) together
# alts <- c(" fun", "I ")
# state2 <- space_fill(DATA$state, alts, rm.extra = FALSE)
# with(DATA, wfm(state2, list(sex, adult)))[1:18, ]
# 
# ## word frequency dataframe (wfdf) example:
# with(DATA, wfdf(state, list(sex, adult)))[1:15, ]
# with(DATA, wfdf(state, person))[1:15, ]
# 
# ## wfm_expanded example:
# z <- wfm(DATA$state, DATA$person)
# wfm_expanded(z)[30:45, ] #two "you"s
# 
# ## wfm_combine examples:
# #===================
# ## raw no margins (will work)
# x <- wfm(DATA$state, DATA$person)
# 
# ## raw with margin (will work)
# y <- wfdf(DATA$state, DATA$person, margins = TRUE)
# 
# ## Proportion matrix
# z2 <- wfm(DATA$state, DATA$person, output="proportion")
# 
# WL1 <- c(y[, 1])
# WL2 <- list(c("read", "the", "a"), c("you", "your", "you're"))
# WL3 <- list(bob = c("read", "the", "a"), yous = c("you", "your", "you're"))
# WL4 <- list(bob = c("read", "the", "a"), yous = c("a", "you", "your", "your're"))
# WL5 <- list(yous = c("you", "your", "your're"))
# WL6 <- list(c("you", "your", "your're"))  #no name so will be called words 1
# WL7 <- c("you", "your", "your're")
# 
# wfm_combine(z2, WL2) #Won't work not a raw frequency matrix
# wfm_combine(x, WL2)  #Works (raw and no margins)
# wfm_combine(y, WL2)  #Works (raw with margins)
# wfm_combine(y, c("you", "your", "your're"))
# wfm_combine(y, WL1)
# wfm_combine(y, WL3)
# ## wfm_combine(y, WL4) #Error
# wfm_combine(y, WL5)
# wfm_combine(y, WL6)
# wfm_combine(y, WL7)
# 
# worlis <- c("you", "it", "it's", "no", "not", "we")
# y <- wfdf(DATA$state, list(DATA$sex, DATA$adult), margins = TRUE)
# z <- wfm_combine(y, worlis)
# 
# chisq.test(z)
# chisq.test(wfm(y))
# 
# ## Dendrogram
# presdeb <- with(pres_debates2012, wfm(dialogue, list(person, time)))
# library(sjPlot)
# sjc.dend(t(presdeb), 2:4)
# 
# ## Words correlated within turns of talk
# ## EXAMPLE 1
# library(reports)
# x <- factor(with(rajSPLIT, paste(act, pad(TOT(tot)), sep = "|")))
# dat <- wfm(rajSPLIT$dialogue, x)
# 
# cor(t(dat)[, c("romeo", "juliet")])
# cor(t(dat)[, c("romeo", "banished")])
# cor(t(dat)[, c("romeo", "juliet", "hate", "love")])
# qheat(cor(t(dat)[, c("romeo", "juliet", "hate", "love")]),
#     diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL)
# 
# dat2 <- wfm(DATA$state, id(DATA))
# qheat(cor(t(dat2)), low = "yellow", high = "red",
#     grid = "grey90", diag.na = TRUE, by.column = NULL)
# 
# ## EXAMPLE 2
# x2 <- factor(with(pres_debates2012, paste(time, pad(TOT(tot)), sep = "|")))
# dat2 <- wfm(pres_debates2012$dialogue, x2)
# wrds <- word_list(pres_debates2012$dialogue,
#     stopwords = c("it's", "that's", Top200Words))
# wrds2 <- tolower(sort(wrds$rfswl[[1]][, 1]))
# qheat(word_cor(t(dat2), word = wrds2, r = NULL),
#     diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL,
#     high="red", low="yellow", grid=NULL)
# 
# ## EXAMPLE 3
# library(gridExtra); library(ggplot2); library(grid)
# dat3 <- lapply(qcv(OBAMA, ROMNEY), function(x) {
#     with(pres_debates2012, wfm(dialogue[person == x], x2[person == x]))
# })
# 
# 
# # Presidential debates by person
# dat5 <- pres_debates2012
# dat5 <- dat5[dat5$person %in% qcv(ROMNEY, OBAMA), ]
# 
# disp <- with(dat5, dispersion_plot(dialogue, wrds2, grouping.var = person,
#     total.color = NULL, rm.vars=time))
# 
# 
# cors <- lapply(dat3, function(m) {
#     word_cor(t(m), word = wrds2, r = NULL)
# })
# 
# plots <- lapply(cors, function(x) {
#     qheat(x, diag.na = TRUE, values = TRUE, digits = 3, plot = FALSE,
#     by.column = NULL, high="red", low="yellow", grid=NULL)
# })
# 
# plots <- lapply(1:2, function(i) {
#     plots[[i]] + ggtitle(qcv(OBAMA, ROMNEY)[i]) +
#     theme(axis.title.x = element_blank(),
#         plot.margin = unit(rep(0, 4), "lines"))
# })
# 
# grid.arrange(disp, arrangeGrob(plots[[1]], plots[[2]], ncol=1), ncol=2)
# 
# ## With `word_cor`
# worlis <- list(
#     pronouns = c("you", "it", "it's", "we", "i'm", "i"),
#     negative = qcv(no, dumb, distrust, not, stinks),
#     literacy = qcv(computer, talking, telling)
# )
# y <- wfdf(DATA$state, qdapTools::id(DATA, prefix = TRUE))
# z <- wfm_combine(y, worlis)
# 
# word_cor(t(z), word = names(worlis), r = NULL)
# 
# ## Plotting method
# plot(y, TRUE)
# plot(z)
# 
# ## Correspondence Analysis
# library(ca)
# 
# dat <- pres_debates2012
# dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]
# 
# speech <- stemmer(dat$dialogue)
# mytable1 <- with(dat, wfm(speech, list(person, time), stopwords = Top25Words))
# 
# fit <- ca(mytable1)
# summary(fit)
# plot(fit)
# plot3d.ca(fit, labels=1)
# 
# 
# mytable2 <- with(dat, wfm(speech, list(person, time), stopwords = Top200Words))
# 
# fit2 <- ca(mytable2)
# summary(fit2)
# plot(fit2)
# plot3d.ca(fit2, labels=1)
# 
# ## Weight a wfm
# WFM <- with(DATA, wfm(state, list(sex, adult)))
# plot(weight(WFM, "scaled"), TRUE)
# weight(WFM, "prop")
# weight(WFM, "max")
# weight(WFM, "scaled")
# ## End(Not run)