Usage:

as.tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

as.TermDocumentMatrix(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

as.dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

as.DocumentTermMatrix(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

## S3 method for class 'Corpus'
as.data.frame(x, row.names, optional, ..., doc = "docs", text = "text",
    sent.split = FALSE)

as.Corpus(text.var, grouping.var = NULL, demographic.vars, ...)

apply_as_tm(wfm.obj, tmfun, ..., to.qdap = TRUE)

apply_as_df(tm.corpus, qdapfun, ..., stopwords = NULL, min = 1, max = Inf,
    count.apostrophe = TRUE, ignore.case = TRUE)

## The S3 methods listed under Details (character, default, Corpus, wfm,
## TermDocumentMatrix, and DocumentTermMatrix) share the signatures above.
Arguments:

text.var: The text variable or a wfm object.

grouping.var: The grouping variables. Default NULL generates one word list
    for all text. Also takes a single grouping variable or a list of 1 or
    more grouping variables.

x: A Corpus object.

row.names: NULL or a character vector giving the row names for the data
    frame. Not used in qdap; for base generic consistency.

optional: logical. If TRUE, setting row names and converting column names is
    optional. Not used in qdap; for base generic consistency.

doc: Name for the Corpus documents.

text: Name for the Corpus text.

sent.split: logical. If TRUE the text variable sentences will be split into
    individual rows.

demographic.vars: Additional demographic information about the grouping
    variables to add to the Corpus.

wfm.obj: A wfm object.

tmfun: A function applied to a TermDocumentMatrix object.

to.qdap: logical. If TRUE should wfm try to coerce the output back to a qdap
    object.

tm.corpus: A Corpus object.

stopwords: A character vector of words to remove from the text. qdap supplies
    several data sets that can be used as stop words, including Top200Words,
    Top100Words, and Top25Words. For the tm package's traditional English
    stop words use tm::stopwords("english").

count.apostrophe: logical. If TRUE apostrophes are counted as characters.

ignore.case: logical. If TRUE stop words will be removed regardless of case.

See Also: wfm, TermDocumentMatrix, sentSplit, Corpus
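A minimal sketch of how these arguments combine, assuming the DATA
demonstration data set and the qdapDictionaries word lists used throughout
the Examples (not run):

    library(qdap)
    ## Default grouping.var = NULL: one word list for all of the text
    as.tdm(DATA$state)
    ## One document per speaker
    as.tdm(DATA$state, DATA$person)
    ## A stop word vector can be supplied as well (see the Examples below)
    as.dtm(DATA$state, DATA$person, stopwords = qdapDictionaries::Top25Words)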
Value:

as.tdm - Returns a TermDocumentMatrix.

as.TermDocumentMatrix - Returns a TermDocumentMatrix.

as.dtm - Returns a DocumentTermMatrix.

as.DocumentTermMatrix - Returns a DocumentTermMatrix.

as.data.frame - Converts a Corpus and returns a qdap oriented data.frame.

as.Corpus - Converts a qdap oriented dataframe and returns a Corpus.

apply_as_tm - Applies a tm oriented function to a wfm and attempts to
simplify back to a wfm or weight format.

apply_as_df - Returns the output typical of the applied qdap function.
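A quick sketch checking the return classes listed above (assumes the bundled
DATA data set and that the tm package is installed; not run):

    library(qdap); library(tm)
    x <- as.tdm(DATA$state, DATA$person)
    y <- as.dtm(DATA$state, DATA$person)
    inherits(x, "TermDocumentMatrix")   ## TRUE
    inherits(y, "DocumentTermMatrix")   ## TRUE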
Details:

as.tdm - Create term document matrices from raw text or wfm for use with
other text analysis packages.

as.TermDocumentMatrix - Create term document matrices from raw text or wfm
for use with other text analysis packages.

as.dtm - Create document term matrices from raw text or wfm for use with
other text analysis packages.

as.DocumentTermMatrix - Create document term matrices from raw text or wfm
for use with other text analysis packages.

as.tdm.Corpus - Corpus method for as.tdm used to convert to a
TermDocumentMatrix.

as.tdm.default - Default method for as.tdm used to convert to a
TermDocumentMatrix.

as.tdm.character - Character method for as.tdm used to convert to a
TermDocumentMatrix.

as.dtm.Corpus - Corpus method for as.dtm used to convert to a
DocumentTermMatrix.

as.dtm.default - Default method for as.dtm used to convert to a
DocumentTermMatrix.

as.dtm.character - Character method for as.dtm used to convert to a
DocumentTermMatrix.

as.tdm.wfm - wfm method for as.tdm used to convert to a TermDocumentMatrix.

as.dtm.wfm - wfm method for as.dtm used to convert to a DocumentTermMatrix.

as.data.frame - Convert a tm package Corpus to a qdap data.frame.

as.Corpus - Attempts to convert its argument into a tm package Corpus.

as.Corpus.sent_split - sent_split method for as.Corpus.

as.Corpus.default - Default method for as.Corpus used to convert vectors
(from a data.frame) to a Corpus.

apply_as_tm - Apply functions intended to be used on the tm package's
TermDocumentMatrix to a wfm object.

apply_as_df - Apply functions intended to be used on the qdap package's
data.frame + sentSplit output to a tm Corpus object (the Corpus is first
treated as a qdap dataframe).

as.Corpus.TermDocumentMatrix - TermDocumentMatrix method for as.Corpus used
to convert to a Corpus.

as.Corpus.DocumentTermMatrix - DocumentTermMatrix method for as.Corpus used
to convert to a Corpus.

as.Corpus.wfm - wfm method for as.Corpus used to convert to a Corpus.
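To illustrate the dispatch described above, a small round trip (a sketch
assuming the bundled DATA data set; not run): the wfm method of as.tdm
produces a TermDocumentMatrix, and the TermDocumentMatrix method of as.Corpus
takes it on to a tm Corpus.

    library(qdap)
    w <- with(DATA, wfm(state, person))
    tdm <- as.tdm(w)          ## as.tdm.wfm
    corp <- as.Corpus(tdm)    ## as.Corpus.TermDocumentMatrix
    class(corp)               ## a tm Corpus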
Produces output that is identical to the tm package's TermDocumentMatrix,
DocumentTermMatrix, and Corpus objects, or provides a convenient interface
between the qdap and tm packages. apply_as_df first coerces the Corpus to a
dataframe with one column named `docs` and the other named `text` before
applying the qdap function.
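For example (a sketch assuming the bundled DATA data set; not run), the
docs/text layout looks like this:

    library(qdap)
    corp <- with(DATA, as.Corpus(state, person))
    dat <- as.data.frame(corp)
    head(dat)   ## two columns: docs (document/grouping id) and text
    ## this is also the layout apply_as_df works from internally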
Examples:

## Not run: 
as.dtm(DATA$state, DATA$person)
as.tdm(DATA$state, DATA$person)

x <- wfm(DATA$state, DATA$person)
as.tdm(x)
as.dtm(x)
library(tm)
plot(as.tdm(x))

pres <- as.tdm(pres_debates2012$dialogue, pres_debates2012$person)
plot(pres, corThreshold = 0.8)
pres
(pres2 <- removeSparseTerms(pres, .3))
plot(pres2, corThreshold = 0.95)

shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
    pres_debates2012)[,1]) < 4]

SW <- c(shorts, qdapDictionaries::contractions[, 1],
    qdapDictionaries::Top200Words,
    "governor", "president", "mister", "obama","romney")

DocTermMat2 <- with(pres_debates2012, as.dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
(DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,])
plot(DocTermMat2)

## Correspondence Analysis
library(ca)

dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]

speech <- stemmer(dat$dialogue)
mytable1 <- with(dat, as.tdm(speech, list(person, time), stopwords = Top25Words))

fit <- ca(as.matrix(mytable1))
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)

mytable2 <- with(dat, as.tdm(speech, list(person, time), stopwords = Top200Words))

fit2 <- ca(as.matrix(mytable2))
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)

## Topic Models
# Example 1
library(topicmodels); library(tm)

# Generate stop words based on short words, frequent words and contractions
shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
    pres_debates2012)[,1]) < 4]

SW <- c(shorts, qdapDictionaries::contractions[, 1],
    qdapDictionaries::Top200Words,
    "governor", "president", "mister", "obama","romney")

DocTermMat <- with(pres_debates2012, as.dtm(dialogue, person, stopwords = SW))
DocTermMat <- removeSparseTerms(DocTermMat,0.999)
DocTermMat <- DocTermMat[rowSums(as.matrix(DocTermMat))> 0,]

lda.model <- LDA(DocTermMat, 5)

(topics <- posterior(lda.model, DocTermMat)$topics)
terms(lda.model,20)

# Plot the Topics Per Person
topic.dat <- matrix2df(topics, "Person")
colnames(topic.dat)[-1] <- paste2(t(terms(lda.model,20)), sep=", ")

library(reshape2)
mtopic <- melt(topic.dat, variable="Topic", value.name="Proportion")
ggplot(mtopic, aes(weight=Proportion, x=Topic, fill=Topic)) +
    geom_bar() +
    coord_flip() +
    facet_grid(Person~.) +
    guides(fill=FALSE)

# Example 2
DocTermMat2 <- with(pres_debates2012, as.dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,]

lda.model2 <- LDA(DocTermMat2, 6)

(topics2 <- posterior(lda.model2, DocTermMat2)$topics)
terms(lda.model2,20)
qheat(topics2, high="blue", low="yellow", by.col=FALSE)

# Example 3
lda.model3 <- LDA(DocTermMat2, 10)

(topics3 <- posterior(lda.model3, DocTermMat2)$topics)
terms(lda.model3, 20)
qheat(topics3, high="blue", low="yellow", by.col=FALSE)

# Plot the Topics Per Person
topic.dat3 <- matrix2df(topics3, "Person&Time")
colnames(topic.dat3)[-1] <- paste2(t(terms(lda.model3, 10)), sep=", ")
topic.dat3 <- colsplit2df(topic.dat3)

library(reshape2)
library(scales)
mtopic3 <- melt(topic.dat3, variable="Topic", value.name="Proportion")
(p1 <- ggplot(mtopic3, aes(weight=Proportion, x=Topic, fill=Topic)) +
    geom_bar() +
    coord_flip() +
    facet_grid(Person~Time) +
    guides(fill=FALSE) +
    scale_y_continuous(labels = percent) +
    theme(plot.margin = unit(c(1, 0, 0.5, .5), "lines")) +
    ylab("Proportion"))

mtopic3.b <- mtopic3
mtopic3.b[, "Topic"] <- factor(as.numeric(mtopic3.b[, "Topic"]), levels = 1:10)
mtopic3.b[, "Time"] <- factor(gsub("time ", "", mtopic3.b[, "Time"]))

p2 <- ggplot(mtopic3.b, aes(x=Time, y=Topic, fill=Proportion)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "grey70", high = "red") +
    facet_grid(Person~Time, scales = "free") +
    theme(axis.title.y = element_blank(),
        axis.text.x= element_text(colour="white"),
        axis.ticks.x= element_line(colour="white"),
        axis.ticks.y = element_blank(),
        axis.text.y= element_blank(),
        plot.margin = unit(c(1, -.5, .5, -.9), "lines")
    )

library(gridExtra)
grid.arrange(p1, p2, nrow=1, widths = c(.85, .15))

## tm Matrices to wfm
library(tm)
data(crude)

## A Term Document Matrix Conversion
(tm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
converted <- as.wfm(tm_in)
head(converted)
summary(converted)

## A Document Term Matrix Conversion
(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
summary(as.wfm(dtm_in))

## `apply_as_tm` Examples
## Create a wfm
a <- with(DATA, wfm(state, list(sex, adult)))
summary(a)

## Apply functions meant for a tm TermDocumentMatrix
out <- apply_as_tm(a, tm:::removeSparseTerms, sparse=0.6)
summary(out)

apply_as_tm(a, tm:::findAssocs, "computer", .8)
apply_as_tm(a, tm:::findFreqTerms, 2, 3)
apply_as_tm(a, tm:::Zipf_plot)
apply_as_tm(a, tm:::Heaps_plot)
apply_as_tm(a, tm:::plot.TermDocumentMatrix, corThreshold = 0.4)

library(proxy)
apply_as_tm(a, tm:::weightBin)
apply_as_tm(a, tm:::weightBin, to.qdap = FALSE)
apply_as_tm(a, tm:::weightSMART)
apply_as_tm(a, tm:::weightTfIdf)

## Convert tm Corpus to Dataframe
## A tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
    readerControl = list(reader = readReut21578XML))

## Convert to dataframe
corp_df <- as.data.frame(reuters)
htruncdf(corp_df)

z <- as.Corpus(DATA$state, DATA$person,
    demographic=DATA[, qcv(sex, adult, code)])
as.data.frame(z)

## Apply a qdap function
out <- formality(corp_df$text, corp_df$docs)
plot(out)

## Convert a qdap dataframe to tm package Corpus
(x <- with(DATA2, as.Corpus(state, list(person, class,
    day))))
library(tm)
inspect(x)
class(x)

(y <- with(pres_debates2012, as.Corpus(dialogue, list(person, time))))

## Add demographic info to DMetaData of Corpus
z <- as.Corpus(DATA$state, DATA$person,
    demographic=DATA[, qcv(sex, adult, code)])
lview(z)

lview(as.Corpus(DATA$state, DATA$person,
    demographic=DATA$sex))

lview(as.Corpus(DATA$state, DATA$person,
    demographic=list(DATA$sex, DATA$adult)))

## Apply qdap functions meant for dataframes from sentSplit to tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
    readerControl = list(reader = readReut21578XML))

matches <- list(
    oil = qcv(oil, crude),
    money = c("economic", "money")
)

apply_as_df(reuters, word_stats)
apply_as_df(reuters, formality)
apply_as_df(reuters, word_list)
apply_as_df(reuters, polarity)
apply_as_df(reuters, Dissimilarity)
apply_as_df(reuters, diversity)
apply_as_df(reuters, pos_by)
apply_as_df(reuters, flesch_kincaid)
apply_as_df(reuters, trans_venn)
apply_as_df(reuters, gantt_plot)
apply_as_df(reuters, rank_freq_mplot)
apply_as_df(reuters, character_table)

(termco_out <- apply_as_df(reuters, termco, match.list = matches))
plot(termco_out, values = TRUE, high="red")

(wordcor_out <- apply_as_df(reuters, word_cor, word = unlist(matches)))
plot(wordcor_out)

(f_terms <- apply_as_df(reuters, freq_terms, at.least = 3))
plot(f_terms)

apply_as_df(reuters, trans_cloud)
## To use "all" rather than "docs" as "grouping.var"...
apply_as_df(reuters, trans_cloud, grouping.var=NULL,
    target.words=matches, cloud.colors = c("red", "blue", "grey75"))

finds <- apply_as_df(reuters, freq_terms, at.least = 5,
    top = 5, stopwords = Top100Words)
apply_as_df(reuters, dispersion_plot, match.terms = finds[, 1],
    total.color = NULL)

## Filter for Term Document Matrix/Document Term Matrix
library(tm)
data(crude)

(tdm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
Filter(tdm_in, 5)

(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
Filter(dtm_in, 5)

## Filter particular words based on max/min values
Filter(dtm_in, 5, 7)
Filter(dtm_in, 4, 4)
Filter(tdm_in, 3, 4)
Filter(tdm_in, 3, 4, stopwords = Top200Words)

## End(Not run)