pos(text.var, parallel = FALSE, cores = detectCores()/2, progress.bar = TRUE, na.omit = FALSE, digits = 1, percent = TRUE, zero.replace = 0, gc.rate = 10)
pos_by(text.var, grouping.var = NULL, digits = 1, percent = TRUE, zero.replace = 0, ...)
pos_tags(type = "pretty")
TRUE
attempts to run the function on
multiple cores. Note that this may not mean a speed boost if you have one
core or if the data set is smaller as the cluster takes time to create.parallel = TRUE
. Default
is half the number of available cores.TRUE
attempts to provide an
OS-appropriate progress bar. If parallel is TRUE
this argument is
ignored. Note that setting this argument to TRUE
may slow down the
function.TRUE
missing values (NA
) will be
omitted.TRUE
output given as percent. If
FALSE
the output is proportion.pos
wraps. Consider adjusting this argument upward if
the error java.lang.OutOfMemoryError
occurs.NULL
generates
one word list for all text. Also takes a single grouping variable or a list
of 1 or more grouping variables."pretty"
(a left justified version of
the output optimized for viewing but not good for export), "matrix"
(a matrix version of the output), "dataframe"
/ "df"
(a
dataframe version of the output), "all"
(a list of all three of the
previous output types).pos
.pos
- returns a list of 7:
text: The original text.
POStagged: The original words replaced with parts of speech in context.
POSprop: Dataframe of the proportion of parts of speech by row.
POSfreq: Dataframe of the frequency of parts of speech by row.
POSrnp: Dataframe of the frequency and proportions of parts of speech
by row.
percent: The value of percent used for plotting purposes.
zero.replace: The value of zero.replace used for plotting purposes.
pos_by
- returns a list of 10:
text: The original text.
POStagged: The original words replaced with parts of speech in context.
POSprop: Dataframe of the proportion of parts of speech by row.
POSfreq: Dataframe of the frequency of parts of speech by row.
POSrnp: Dataframe of the frequency and proportions of parts of speech
by row.
pos.by.prop: Dataframe of the proportion of parts of speech by grouping
variable.
pos.by.freq: Dataframe of the frequency of parts of speech by grouping
variable.
pos.by.rnp: Dataframe of the frequency and proportions of parts of
speech by grouping variable.
percent: The value of percent used for plotting purposes.
zero.replace: The value of zero.replace used for plotting purposes.
pos
- Apply part of speech tagger to transcript(s).
pos_by
- Apply part of speech tagger to transcript(s) by zero or more
grouping variable(s).
pos_tags
- Useful for interpreting the parts of speech tags created by
pos and pos_by.
## <strong>Not run</strong>: # posdat <- pos(DATA$state) # ltruncdf(posdat, 7, 4) # ## str(posdat) # names(posdat) # posdat$text #original text # # ## Methods # preprocessed(posdat) #words replaced with parts of speech # counts(posdat) #frequency of parts of speech by row # proportions(posdat) #proportion of parts of speech by row # # ## Methods Plotting # plot(preprocessed(posdat)) # plot(counts(posdat)) # plot(proportions(posdat)) # plot(posdat) # # out1 <- pos(DATA$state, parallel = TRUE) # not always useful # ltruncdf(out1, 7, 4) # # #use pos_tags to interpret part of speech tags used by pos & pos_by # pos_tags()[1:10, ] # pos_tags("matrix")[1:10, ] # pos_tags("dataframe")[1:10, ] # pos_tags("df")[1:10, ] # ltruncdf(pos_tags("all"), 3) # # posbydat <- with(DATA, pos_by(state, sex)) # names(posbydat) # # ## Methods # scores(posbydat) # preprocessed(posbydat) # counts(posbydat) # proportions(posbydat) # # ## Methods Plotting # plot(preprocessed(posbydat)) # plot(counts(posbydat)) # plot(proportions(posbydat)) # plot(posbydat) # # ltruncdf(posbydat, 7, 4) # truncdf(posbydat$pos.by.prop, 4) # # POSby <- with(DATA, pos_by(state, list(adult, sex))) # plot(POSby, values = TRUE, digits = 2) # #or more quickly - reuse the output from before # out2 <- with(DATA, pos_by(posbydat, list(adult, sex))) # # ## Definite/Indefinite Noun # ## 2 approaches compared... 
# ## The latter is more efficient but less accurate # # ## ------------------------## # ## Part of speech tagging ## # ## ------------------------## # pos_after <- function(text.var, words, pos){ # # posses <- strsplit(as.character(text.var[["POStagged"]][["POStagged"]]), "\\s+") # namespos <- lapply(posses, function(x) { # y <- unlist(strsplit(x, "/")) # setNames(y[c(TRUE, FALSE)], y[c(FALSE, TRUE)]) # }) # # lapply(namespos, function(x, thewords = words, thepos = pos){ # locs <- which(x %in% thewords) # locs <- locs[!is.na(locs)] # # if (identical(unclass(locs), integer(0))) return(NA_character_) # # nounlocs <- which(names(x) %in% thepos) # # unname(x[unique(sapply(locs, function(x){ # min(nounlocs[nounlocs - x > 0]) # }))]) # }) # } # # out2 <- setNames(lapply(list(a=c("a", "an"), the="the"), function(x) { # o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS")) # m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word") # m[m$freq> 3, ] # }), c("a", "the")) # # # dat2 <- setNames(Reduce(function(x, y) { # merge(x, y, by = "word", all = TRUE)}, out2), c("Word", "A", "THE")) # # dat2 <- reshape2::melt(dat2, id="Word", variable.name="Article", value.name="freq") # # dat2 <- dat2[order(dat2$freq, dat2$Word), ] # # ord2 <- aggregate(freq ~ Word, dat2, sum) # # dat2$Word <- factor(dat2$Word, levels=ord2[order(ord2[[2]]), 1]) # rownames(dat2) <- NULL # ggplot(dat2, aes(x=freq, y=Word)) + # geom_point()+ facet_grid(~Article) + # ggtitle("Part Of Speech Parsing Approach") # # dev.new() # # ## --------------------## # ## Regular Expressions ## # ## --------------------## # # library(qdapRegex);library(ggplot2);library(reshape2) # # out <- setNames(lapply(c("@after_a", "@after_the"), function(x) { # o <- rm_default(stringi:::stri_trans_tolower(raj$dialogue), # pattern = x, extract=TRUE) # m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word") # m[m$freq> 3, ] # }), c("a", "the")) # # # dat <- setNames(Reduce(function(x, 
y) { # merge(x, y, by = "word", all = TRUE)}, out), c("Word", "A", "THE")) # # dat <- reshape2::melt(dat, id="Word", variable.name="Article", value.name="freq") # # dat <- dat[order(dat$freq, dat$Word), ] # # ord <- aggregate(freq ~ Word, dat, sum) # # dat$Word <- factor(dat$Word, levels=ord[order(ord[[2]]), 1]) # rownames(dat) <- NULL # ggplot(dat, aes(x=freq, y=Word)) + # geom_point()+ facet_grid(~Article) + # ggtitle("Regex Approach") # ## <strong>End(Not run)</strong>