termco(text.var, grouping.var = NULL, match.list, short.term = TRUE, ignore.case = TRUE, elim.old = TRUE, percent = TRUE, digits = 2, apostrophe.remove = FALSE, char.keep = NULL, digit.remove = NULL, zero.replace = 0, ...)
termco_d(text.var, grouping.var = NULL, match.string, short.term = FALSE, ignore.case = TRUE, zero.replace = 0, percent = TRUE, digits = 2, apostrophe.remove = FALSE, char.keep = NULL, digit.remove = TRUE, ...)
term_match(text.var, terms, return.list = TRUE, apostrophe.remove = FALSE)
termco2mat(dataframe, drop.wc = TRUE, short.term = TRUE, rm.zerocol = FALSE, no.quote = TRUE, transform = TRUE, trim.terms = TRUE)
grouping.var: The default NULL generates
one word list for all text. Also takes a single grouping variable or a list
of 1 or more grouping variables.
short.term: If TRUE, column names are trimmed versions
of the match list; otherwise the terms are wrapped with 'term(phrase)'.
ignore.case: If TRUE, case is ignored.
elim.old: If TRUE, eliminates the columns that are
combined together by the named match.list.
percent: If TRUE, output is given as percent. If
FALSE, the output is proportion.
apostrophe.remove: If TRUE,
removes apostrophes from
the text before examining.
char.keep: termco attempts to auto detect characters to
keep based on the elements in match.list.
digit.remove: If TRUE, strips digits from the text
before counting. termco attempts to auto detect if digits
should be retained based on the elements in match.list.
match.string: When used with term_match,
the term(s) must be words or partial words but do not have
to be when using termco_d
(i.e., they can be phrases,
symbols etc.).
terms: The terms to search for in text.var. Similar to
match.list, but these terms must be words or partial words rather than
multiple words and symbols.
return.list: If TRUE,
returns the output for multiple
terms as a list by term rather than a vector.
drop.wc: If TRUE, the word count column will be
dropped.
rm.zerocol: If TRUE, any column containing all zeros
will be removed from the matrix.
no.quote: If TRUE, the matrix will be printed without
quotes if it's character.
transform: If TRUE, the matrix will be transformed.
trim.terms: If TRUE, trims the column header/names to
ensure there is not a problem with spacing when using in other R functions.
...: Other arguments passed to strip.
termco
& termco_d
- both return a list, of class
"termco", of data frames and information regarding word counts:
raw: raw word counts by grouping variable
prop: proportional word counts by grouping variable; proportional to
each individual's word use
rnp: a character combination data frame of raw and proportional
zero_replace: value to replace zeros with; mostly internal use
percent: the value of percent used for plotting purposes
digits: integer value of number of digits to display; mostly internal
use
term_match
- returns a list or vector of possible words that
match term(s).
termco2mat
- returns a matrix of term counts.
termco
- Search a transcript by any number of grouping variables for
categories (themes) of grouped root terms. While there are other termco
functions in the termco family (e.g., termco_d), termco
is a more powerful and flexible wrapper intended for general
use.
termco_d
- Search a transcript by any number of grouping variables for
root terms.
term_match
- Search a transcript for words that exactly match term(s).
termco2mat
- Convert a termco dataframe to a matrix for use with
visualization functions (e.g., heatmap.2).
The match.list/match.string is (optionally) case and character sensitive. Spacing is an important way to grab specific words and requires careful thought. Using "read" will find the words "bread", "read", "reading", and "ready". If you want to search for just the word "read", you'd supply a vector of c(" read ", " reads", " reading", " reader"). To search for non-character arguments (i.e., numbers and symbols), additional arguments from strip must be passed.
Percentages are calculated as a ratio of counts of
match.list
elements to word counts. Word counts do not contain
symbols or digits. Using symbols, digits or small segments of full words
(e.g., "to") could total more than 100%.
## <strong>Not run</strong>: # #termco examples: # # term <- c("the ", "she", " wh") # (out <- with(raj.act.1, termco(dialogue, person, term))) # # plot(out) # scores(out) # plot(scores(out)) # counts(out) # plot(counts(out)) # proportions(out) # plot(proportions(out)) # # # General form for match.list as themes # # # # ml <- list( # # cat1 = c(), # # cat2 = c(), # # catn = c() # # ) # # ml <- list( # cat1 = c(" the ", " a ", " an "), # cat2 = c(" I'" ), # "good", # the = c("the", " the ", " the", "the") # ) # # (dat <- with(raj.act.1, termco(dialogue, person, ml))) # scores(dat) #useful for presenting in tables # counts(dat) #prop and raw counts are useful for performing calculations # proportions(dat) # datb <- with(raj.act.1, termco(dialogue, person, ml, # short.term = FALSE, elim.old=FALSE)) # ltruncdf(datb, 20, 6) # # (dat2 <- data.frame(dialogue=c("@bryan is bryan good @br", # "indeed", "@ brian"), person=qcv(A, B, A))) # # ml2 <- list(wrds=c("bryan", "indeed"), "@", bryan=c("bryan", "@ br", "@br")) # # with(dat2, termco(dialogue, person, match.list=ml2)) # # with(dat2, termco(dialogue, person, match.list=ml2, percent = FALSE)) # # DATA$state[1] <- "12 4 rgfr r0ffrg0" # termco(DATA$state, DATA$person, '0', digit.remove=FALSE) # DATA <- qdap::DATA # # #Using with term_match and exclude # exclude(term_match(DATA$state, qcv(th), FALSE), "truth") # termco(DATA$state, DATA$person, exclude(term_match(DATA$state, qcv(th), # FALSE), "truth")) # MTCH.LST <- exclude(term_match(DATA$state, qcv(th, i)), qcv(truth, stinks)) # termco(DATA$state, DATA$person, MTCH.LST) # # syns <- synonyms("doubt") # syns[1] # termco(DATA$state, DATA$person, unlist(syns[1])) # synonyms("doubt", FALSE) # termco(DATA$state, DATA$person, list(doubt = synonyms("doubt", FALSE))) # termco(DATA$state, DATA$person, syns) # # #termco_d examples: # termco_d(DATA$state, DATA$person, c(" the", " i'")) # termco_d(DATA$state, DATA$person, c(" the", " i'"), ignore.case=FALSE) # termco_d(DATA$state, 
DATA$person, c(" the ", " i'")) # # # termco2mat example: # MTCH.LST <- exclude(term_match(DATA$state, qcv(a, i)), qcv(is, it, am, shall)) # termco_obj <- termco(DATA$state, DATA$person, MTCH.LST) # termco2mat(termco_obj) # plot(termco_obj) # plot(termco_obj, label = TRUE) # plot(termco_obj, label = TRUE, text.color = "red") # plot(termco_obj, label = TRUE, text.color="red", lab.digits=3) # # ## REVERSE TERMCO (return raw words found per variable) # df <- data.frame(x=1:6, # y = c("the fluffy little bat" , "the man was round like a ball", # "the fluffy little bat" , "the man was round like a ball", # "he ate the chair" , "cough, cough"), # stringsAsFactors=FALSE) # # l <- list("bat" ,"man", "ball", "heavy") # z <- counts(termco(df$y, qdapTools::id(df), l))[, -2] # # counts2list(z[, -1], z[, 1]) # # ## politness # politness <- c("please", "excuse me", "thank you", "you welcome", # "you're welcome", "i'm sorry", "forgive me", "pardon me") # # with(pres_debates2012, termco(dialogue, person, politness)) # with(hamlet, termco(dialogue, person, politness)) # # ## Term Use Percentage per N Words # dat <- with(raj, chunker(dialogue, person, n.words = 100, rm.unequal = TRUE)) # dat2 <- list2df(dat, "Dialogue", "Person") # dat2[["Duration"]] <- unlist(lapply(dat, id, pad=FALSE)) # dat2 <- qdap_df(dat2, "Dialogue") # # Top5 <- sapply(split(raj$dialogue, raj$person), wc, FALSE) %>% # sort(decreasing=TRUE) %>% # list2df("wordcount", "person") %>% # `[`(1:5, 2) # # propdat <- dat2 %&% # termco(list(Person, Duration), as.list(Top25Words[1:5]), percent = FALSE) %>% # proportions %>% # colsplit2df %>% # reshape2::melt(id=c("Person", "Duration", "word.count"), variable="Word") %>% # dplyr::filter(Person %in% Top5) # # head(propdat) # # ggplot(propdat, aes(y=value, x=Duration, group=Person, color=Person)) + # geom_line(size=1.25) + # facet_grid(Word~., scales="free_y") + # ylab("Percent of Word Use") + # xlab("Per 100 Words") + # scale_y_continuous(labels = percent) # # 
ggplot(propdat, aes(y=value, x=Duration, group=Word, color=Word)) + # geom_line(size=1.25) + # facet_grid(Person~.) + # ylab("Percent of Word Use") + # xlab("Per 100 Words") + # scale_y_continuous(labels = percent) # # ggplot(propdat, aes(y=value, x=Duration, group=Word)) + # geom_line() + # facet_grid(Word~Person, scales="free_y") + # ylab("Percent of Word Use") + # xlab("Per 100 Words") + # scale_y_continuous(labels = percent) + # ggthemes::theme_few() # ## <strong>End(Not run)</strong>
termco_c, colcomb2class