pos. qdap 2.2.0

Usage

pos(text.var, parallel = FALSE, cores = detectCores()/2, progress.bar = TRUE, na.omit = FALSE, digits = 1, percent = TRUE, zero.replace = 0, gc.rate = 10)
pos_by(text.var, grouping.var = NULL, digits = 1, percent = TRUE, zero.replace = 0, ...)
pos_tags(type = "pretty")

Arguments

text.var: The text variable.
parallel: logical. If TRUE attempts to run the function on multiple cores. Note that this may not mean a speed boost if you have one core or if the data set is smaller as the cluster takes time to create.
cores: The number of cores to use if parallel = TRUE. Default is half the number of available cores.
progress.bar: logical. If TRUE attempts to provide an OS-appropriate progress bar. If parallel is TRUE this argument is ignored. Note that setting this argument to TRUE may slow down the function.
na.omit: logical. If TRUE missing values (NA) will be omitted.
digits: Integer; number of decimal places to round when printing.
percent: logical. If TRUE output given as percent. If FALSE the output is proportion.
zero.replace: Value to replace 0 values with.
gc.rate: An integer value. This is a necessary argument because of a problem with the garbage collection in the openNLP function that pos wraps. Consider adjusting this argument upward if the error java.lang.OutOfMemoryError occurs.
grouping.var: The grouping variables. Default NULL generates one word list for all text. Also takes a single grouping variable or a list of 1 or more grouping variables.
type: An optional character string giving the output of the pos tags. This must be one of the strings "pretty" (a left justified version of the output optimized for viewing but not good for export), "matrix" (a matrix version of the output), "dataframe"/"df" (a dataframe version of the output), or "all" (a list of all three of the previous output types).
...: Other arguments supplied to pos.

Parts of Speech Tagging

Value

pos - returns a list of 7: text - The original text. POStagged - The original words replaced with parts of speech in context. POSprop - Dataframe of the proportion of parts of speech by row. POSfreq - Dataframe of the frequency of parts of speech by row. POSrnp - Dataframe of the frequency and proportions of parts of speech by row. percent - The value of percent used for plotting purposes. zero.replace - The value of zero.replace used for plotting purposes.

pos_by - returns a list of 10: text - The original text. POStagged - The original words replaced with parts of speech in context. POSprop - Dataframe of the proportion of parts of speech by row. POSfreq - Dataframe of the frequency of parts of speech by row. POSrnp - Dataframe of the frequency and proportions of parts of speech by row. pos.by.prop - Dataframe of the proportion of parts of speech by grouping variable. pos.by.freq - Dataframe of the frequency of parts of speech by grouping variable. pos.by.rnp - Dataframe of the frequency and proportions of parts of speech by grouping variable. percent - The value of percent used for plotting purposes. zero.replace - The value of zero.replace used for plotting purposes.

Description

pos - Apply part of speech tagger to transcript(s).

pos_by - Apply part of speech tagger to transcript(s) by zero or more grouping variable(s).

pos_tags - Useful for interpreting the parts of speech tags created by pos and pos_by.

References

http://opennlp.apache.org

Examples

## Not run: 
# posdat <- pos(DATA$state)
# ltruncdf(posdat, 7, 4)
# ## str(posdat)
# names(posdat)
# posdat$text           #original text
# 
# ## Methods
# preprocessed(posdat)  #words replaced with parts of speech
# counts(posdat)        #frequency of parts of speech by row
# proportions(posdat)   #proportion of parts of speech by row
# 
# ## Methods Plotting
# plot(preprocessed(posdat))
# plot(counts(posdat))
# plot(proportions(posdat))
# plot(posdat)
# 
# out1 <- pos(DATA$state, parallel = TRUE) # not always useful
# ltruncdf(out1, 7, 4)
# 
# #use pos_tags to interpret part of speech tags used by pos & pos_by
# pos_tags()[1:10, ]
# pos_tags("matrix")[1:10, ]
# pos_tags("dataframe")[1:10, ]
# pos_tags("df")[1:10, ]
# ltruncdf(pos_tags("all"), 3)
# 
# posbydat <- with(DATA, pos_by(state, sex))
# names(posbydat)
# 
# ## Methods
# scores(posbydat)
# preprocessed(posbydat)
# counts(posbydat)
# proportions(posbydat)
# 
# ## Methods Plotting
# plot(preprocessed(posbydat))
# plot(counts(posbydat))
# plot(proportions(posbydat))
# plot(posbydat)
# 
# ltruncdf(posbydat, 7, 4)
# truncdf(posbydat$pos.by.prop, 4)
# 
# POSby <- with(DATA, pos_by(state, list(adult, sex)))
# plot(POSby, values = TRUE, digits = 2)
# #or more quickly - reuse the output from before
# out2 <- with(DATA, pos_by(posbydat, list(adult, sex)))
# 
# ## Definite/Indefinite Noun
# ## 2 approaches compared...
# ## The latter is more efficient but less accurate
# 
# ## ------------------------##
# ## Part of speech tagging  ##
# ## ------------------------##
# pos_after <- function(text.var, words, pos){
# 
#     posses <- strsplit(as.character(text.var[["POStagged"]][["POStagged"]]), "\\s+")
#     namespos <- lapply(posses, function(x) {
#         y <- unlist(strsplit(x, "/"))
#         setNames(y[c(TRUE, FALSE)], y[c(FALSE, TRUE)])
#     })
# 
#     lapply(namespos, function(x, thewords = words, thepos = pos){
#         locs <- which(x %in% thewords)
#         locs <- locs[!is.na(locs)]
# 
#         if (identical(unclass(locs), integer(0))) return(NA_character_)
# 
#         nounlocs <- which(names(x) %in% thepos)
# 
#         unname(x[unique(sapply(locs, function(x){
#             min(nounlocs[nounlocs - x > 0])
#         }))])
#     })
# }
# 
# out2 <- setNames(lapply(list(a=c("a", "an"), the="the"), function(x) {
#     o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
#     m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
#     m[m$freq> 3, ]
# }), c("a", "the"))
# 
# 
# dat2 <- setNames(Reduce(function(x, y) {
#     merge(x, y, by = "word", all = TRUE)}, out2), c("Word", "A", "THE"))
# 
# dat2 <- reshape2::melt(dat2, id="Word", variable.name="Article", value.name="freq")
# 
# dat2 <- dat2[order(dat2$freq, dat2$Word), ]
# 
# ord2 <- aggregate(freq ~ Word, dat2, sum)
# 
# dat2$Word <- factor(dat2$Word, levels=ord2[order(ord2[[2]]), 1])
# rownames(dat2) <- NULL
# ggplot(dat2, aes(x=freq, y=Word)) +
#     geom_point()+ facet_grid(~Article) +
#     ggtitle("Part Of Speech Parsing Approach")
# 
# dev.new()
# 
# ## --------------------##
# ## Regular Expressions ##
# ## --------------------##
# 
# library(qdapRegex);library(ggplot2);library(reshape2)
# 
# out <- setNames(lapply(c("@after_a", "@after_the"), function(x) {
#     o <- rm_default(stringi:::stri_trans_tolower(raj$dialogue),
#         pattern = x, extract=TRUE)
#     m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
#     m[m$freq> 3, ]
# }), c("a", "the"))
# 
# 
# dat <- setNames(Reduce(function(x, y) {
#     merge(x, y, by = "word", all = TRUE)}, out), c("Word", "A", "THE"))
# 
# dat <- reshape2::melt(dat, id="Word", variable.name="Article", value.name="freq")
# 
# dat <- dat[order(dat$freq, dat$Word), ]
# 
# ord <- aggregate(freq ~ Word, dat, sum)
# 
# dat$Word <- factor(dat$Word, levels=ord[order(ord[[2]]), 1])
# rownames(dat) <- NULL
# ggplot(dat, aes(x=freq, y=Word)) +
#     geom_point()+ facet_grid(~Article) +
#     ggtitle("Regex Approach")
# ## End(Not run)

Parts of Speech Tagging

Usage

Arguments

Parts of Speech Tagging

Value

Description

References

Examples

See also