This report performs exploratory analysis on the text data to understand how to build an n-gram based next-word predictor.
In this report I do the following: 1. download the data and calculate basic statistics such as file sizes, line counts, and longest line lengths; 2. create a 1% sample of the data for testing purposes (when I build the final predictor I can model on a larger sample); 3. calculate n-gram frequencies for N = 1, 2, 3, 4 and plot the most frequent n-grams; 4. build a basic predictor model that looks up the most frequent n-gram matching the searched phrase.
#url = 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
#download.file(url,"Coursera-SwiftKey.zip")
#unzip("Coursera-SwiftKey.zip")
setwd("C:/www/coursera/data-science/datasciencecoursera/course10")
text_dir = 'final/en_US'
text_files <- list.files(text_dir, pattern = "\\.txt$", full.names = TRUE)
# Create a new data frame with specified columns
df <- data.frame(
  File = character(),
  Size = numeric(),
  LineCount = integer(),
  LongestLength = integer(),
  stringsAsFactors = FALSE
)
for (i in seq_along(text_files)) {
  file_size <- file.info(text_files[i])$size
  lines <- readLines(text_files[i])
  num_lines <- length(lines)
  longest_line_length <- max(nchar(lines))
  df <- rbind(df, data.frame(File = text_files[i],
                             Size = file_size,
                             LineCount = num_lines,
                             LongestLength = longest_line_length))
}
print(df)
## File Size LineCount LongestLength
## 1 final/en_US/en_US.blogs.txt 210160014 899288 40833
## 2 final/en_US/en_US.news.txt 205811887 77259 5760
## 3 final/en_US/en_US.twitter.txt 167105338 2360148 144
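The Size column is in bytes; for readability one could add a derived megabyte column (an illustrative extra step, not part of the original table):
# Convert bytes to megabytes, e.g. 210160014 bytes is roughly 200 MB
df$SizeMB <- round(df$Size / 1024^2, 1)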
# Count the lines in a file that contain search_string (case-insensitive);
# optionally print each matching line.
count_words <- function(filepath, search_string, print = FALSE) {
  lines <- readLines(filepath)
  count_lines <- 0
  for (line in lines) {
    if (grepl(search_string, line, ignore.case = TRUE)) {
      count_lines <- count_lines + 1
      if (print) {
        print(line)
      }
    }
  }
  return(count_lines)
}
counts_love <- count_words(text_files[3],"love")
counts_hate <- count_words(text_files[3], "hate")
counts_love/counts_hate
## [1] 4.669125
counts_biostats <- count_words(text_files[3],"biostats",TRUE)
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
count_long <- count_words(text_files[3],"A computer once beat me at chess, but it was no match for me at kickboxing", TRUE)
## [1] "A computer once beat me at chess, but it was no match for me at kickboxing"
## [1] "A computer once beat me at chess, but it was no match for me at kickboxing"
## [1] "A computer once beat me at chess, but it was no match for me at kickboxing"
## Sampled lines have been saved to: sample/en_US
# Packages used below: tm (corpus, DTM), RWeka (n-gram tokenizer),
# ggplot2/gridExtra (plots), stringr (string splitting).
library(tm)
library(RWeka)
library(ggplot2)
library(gridExtra)
library(stringr)
# Tokenize text into n-grams of a fixed order n
NgramTokenizer <- function(x, n) { NGramTokenizer(x, Weka_control(min = n, max = n)) }
create_dtm <- function(corpus, n) {
  DocumentTermMatrix(corpus, control = list(tokenize = function(x) NgramTokenizer(x, n)))
}
# Sum term counts across documents and sort by decreasing frequency
create_freq <- function(dtm_ngram) {
  ngram_freq <- colSums(as.matrix(dtm_ngram))
  ngram_freq <- sort(ngram_freq, decreasing = TRUE)
  return(ngram_freq)
}
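As a quick sanity check (not part of the original report), the tokenizer can be run on a toy string to see the bigrams it produces:
# Bigrams of a four-word toy string; expected to be roughly
# "the quick", "quick brown", "brown fox"
NgramTokenizer("the quick brown fox", 2)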
# To run on the full data set, change text_dir to 'final/en_US/' (after setwd() to your own path)
text_dir = 'sample/en_US/'
corpus <- VCorpus(DirSource(text_dir))
# Preprocess the text
corpus <- tm_map(corpus, content_transformer(tolower))
#corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
#corpus <- tm_map(corpus, removeWords, stopwords("en"))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
# Create a Document-Term Matrix (DTM)
dtm_ng1 <- create_dtm(corpus, 1)
# Find word frequencies
ng1_freq <- create_freq(dtm_ng1)
# Display the top 10 most frequent terms
head(ng1_freq, 10)
## the and you for that with this was have are
## 294285 158726 91568 77629 76104 47798 42683 40906 39814 36451
Let’s create n-grams for N = 1, 2, 3, 4.
#dtm_ng1 <- create_dtm(corpus, 1)
dtm_ng2 <- create_dtm(corpus, 2)
dtm_ng3 <- create_dtm(corpus, 3)
dtm_ng4 <- create_dtm(corpus, 4)
# Find ngram frequencies
#ng1_freq <- create_freq(dtm_ng1)
ng2_freq <- create_freq(dtm_ng2)
ng3_freq <- create_freq(dtm_ng3)
ng4_freq <- create_freq(dtm_ng4)
# Display the top 10 most frequent bigrams
cat("Top 10 most frequent bigrams:\n")
## Top 10 most frequent bigrams:
print(head(ng2_freq, 10))
## right now cant wait dont know last night feel like
## 215 167 151 139 115
## can get im going looking forward happy birthday let know
## 113 110 105 100 93
cat("Top 10 most frequent ngrams N=2:\n")
## Top 10 most frequent ngrams N=2:
print(head(ng4_freq, 10))
## gas gas gas gas matt hunter matt hunter
## 9 9
## north dakota snowmobile trail hate andover hate andover
## 9 8
## hunter matt hunter matt andover hate andover hate
## 8 7
## happy mothers day moms justin justin justin justin
## 6 6
## university chicago chicago illinois get real rewards just
## 5 4
cat("Top 10 most frequent ngrams N=3:\n")
## Top 10 most frequent ngrams N=3:
print(head(ng3_freq, 10))
## cant wait see happy mothers day let us know happy new year
## 40 37 20 19
## im pretty sure feel like im new york city new york times
## 15 13 11 11
## cant wait hear gas gas gas
## 10 10
# Bar chart of the 20 most frequent n-grams of order n
createPlot <- function(n, ngram_freq) {
  ngram_freq_df <- data.frame(ngram = names(ngram_freq[1:20]), frequency = ngram_freq[1:20])
  myplot <- ggplot(ngram_freq_df, aes(x = reorder(ngram, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "green", color = "black") +
    labs(title = paste0("Top 20 n-gram Frequencies, N=", n), x = "n-grams", y = "Frequency") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  return(myplot)
}
# Plot the top-20 frequency bar charts for N = 1..4
plot1 <- createPlot(1, ng1_freq)
plot2 <- createPlot(2, ng2_freq)
plot3 <- createPlot(3, ng3_freq)
plot4 <- createPlot(4, ng4_freq)
# Arrange plots in a 2x2 grid
grid.arrange(plot1, plot2, plot3, plot4, nrow = 2, ncol = 2)
We now have the word-frequency table. We convert it to a data frame, compute the total number of word occurrences, and then determine how many unique words (taken in decreasing order of frequency) are needed to cover 50% and 90% of all word occurrences.
freq_data <- data.frame(ng1_freq)
total_word_count <- sum(freq_data)
half_word_count <- total_word_count*0.5
ninety_percent <- total_word_count*0.9
# Walk down the sorted unigram frequencies and count how many words are
# needed before the cumulative frequency reaches coverage_count.
findWordCount <- function(coverage_count) {
  # Initialize cumulative sum and counter
  cumulative_sum <- 0
  counter <- 0
  # Iterate through the term frequencies (already sorted in decreasing order)
  for (freq in ng1_freq) {
    cumulative_sum <- cumulative_sum + freq
    counter <- counter + 1
    if (cumulative_sum >= coverage_count) {
      break
    }
  }
  return(c(counter, cumulative_sum))
}
# Print the result
data50 <- findWordCount(half_word_count)
data90 <- findWordCount(ninety_percent)
cat("Number of unique words needed for 50% coverage:", data50[1], "for 90%", data90[1],"\n")
## Number of unique words needed for 50% coverage: 824 for 90% 14543
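For reference, the same counts can be obtained without an explicit loop; a vectorized sketch using cumsum (equivalent to findWordCount, since ng1_freq is already sorted in decreasing order):
# Cumulative coverage of the sorted unigram frequencies
coverage <- cumsum(ng1_freq) / sum(ng1_freq)
c(words_for_50 = which(coverage >= 0.5)[1],
  words_for_90 = which(coverage >= 0.9)[1])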
First we get the vector of terms and check each one against an English dictionary (hunspell); words not found in the dictionary are candidates for being non-English, although many are simply contractions without apostrophes, slang, or misspellings, as the sample below shows.
# Get the terms (words)
terms <- names(ng1_freq)
# Display the first few terms
head(terms)
## [1] "just" "like" "will" "one" "can" "get"
library(hunspell)
# Limiting to the 1000 most frequent terms for the purpose of demonstration
terms_subset <- terms[1:1000]
is_english <- sapply(terms_subset, function(word) {
  hunspell_check(word, dict = "en_US")
})
# Words not in the English dictionary are candidates for being foreign
foreign_words <- terms_subset[!is_english]
head(foreign_words)
## [1] "dont" "lol" "thats" "youre" "ive" "didnt"
cat("Number of foreign words:", length(foreign_words),"\n")
## Number of foreign words: 2508
To increase coverage, the words can be stored as stems: for example, “goes” and “going” can both be covered by “go”. Regular expressions can also be used to match different variants of a word, as in the example below.
pattern <- "\\bgo(es|ing)?\\b"
text <- c("She goes to school", "They are going to the park", "I will go home", "Gone with the wind")
matches <- grep(pattern, text, value = TRUE)
print(matches)
## [1] "She goes to school" "They are going to the park"
## [3] "I will go home"
Next we tokenize the corpus text; in simple terms, this splits the entire text into individual words (or word-like pieces of text). These tokens are later used as a fallback source of random words for the predictor.
# Convert the corpus to a character vector
corpus_text <- sapply(corpus, as.character)
corpus_text <- unlist(corpus_text)
# Tokenize the text
tokens <- unlist(str_split(corpus_text, " "))
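Splitting on a single space is crude: punctuation stays attached to the surrounding words, and stray spaces can produce empty tokens (which is why the predictor occasionally returns an empty string below). A slightly more robust sketch, not used in the rest of the report, is stringr's word-boundary tokenizer:
# Alternative tokenization on word boundaries instead of single spaces
tokens_alt <- unlist(str_split(corpus_text, boundary("word")))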
Predictions are of low quality here because the data cleaning is not very effective and the sample size is too small; many searched phrases do not match any stored n-gram, in which case the model falls back to a random token. But this framework will allow me to build the final product in the coming weeks.
# Return the last n elements of a word vector (the whole vector if n >= length)
getLastWords <- function(v, n) {
  vlen <- length(v)
  i1 <- vlen - n + 1
  i2 <- vlen
  cat(vlen, i1, i2)  # debug trace (no trailing newline, so it runs into the next message)
  if (n >= vlen) {
    return(v)
  } else {
    return(v[i1:i2])
  }
}
# Back-off predictor: try the longest suffix of the input first, then shorter
# ones; among stored n-grams that start with that suffix, pick the most
# frequent and return the word that follows it. Fall back to a random token.
predict_next_word <- function(ngram_freq, previous_words) {
  prev_arr <- str_split(previous_words, " ")[[1]]
  arr_len <- length(prev_arr)
  cat("length of words:", arr_len, "\n")
  for (n in arr_len:1) {
    curr_arr <- paste(getLastWords(prev_arr, n), collapse = " ")
    cat("Trying:", curr_arr, "\n")
    candidates <- grep(paste0("^", curr_arr, " "),
                       names(ngram_freq), value = TRUE)
    cat("Candidates:", head(candidates, 10), "\n")
    if (length(candidates) > 0) {
      next_word <- str_split(candidates[which.max(ngram_freq[candidates])], " ")[[1]]
      return(next_word[n + 1])
    } else if (n == 1) {
      return(sample(tokens, 1))
    }
  }
}
ng_arr = c(ng2_freq, ng3_freq, ng4_freq)
# Examples of predictions
predict_next_word(ng_arr, "fake fake fake")
## length of words: 3
## 3 1 3Trying: fake fake fake
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: fake fake
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: fake
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "trader"
predict_next_word(ng_arr, "new york")
## length of words: 2
## 2 1 2Trying: new york
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 2 2 2Trying: york
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "check"
predict_next_word(ng_arr, "how are")
## length of words: 2
## 2 1 2Trying: how are
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 2 2 2Trying: are
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "ifwomendidnotexist"
predict_next_word(ng_arr, "cool")
## length of words: 1
## 1 1 1Trying: cool
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "action"
# Function to calculate perplexity for any n-gram frequency parameter
calculate_perplexity <- function(test_sentence, ngram_freq, n) {
  # Split the test sentence into words
  test_ngrams <- unlist(strsplit(test_sentence, " "))
  # Calculate the number of n-grams in the test sentence
  N <- length(test_ngrams) - (n - 1)
  # Initialize the log probability sum
  log_prob_sum <- 0
  # Loop through the test n-grams
  for (i in 1:N) {
    ngram <- paste(test_ngrams[i:(i + n - 1)], collapse = " ")
    if (ngram %in% names(ngram_freq)) {
      log_prob_sum <- log_prob_sum + log(ngram_freq[ngram] / sum(ngram_freq))
    } else {
      # Add a small probability for unseen n-grams (smoothing)
      log_prob_sum <- log_prob_sum + log(1 / (sum(ngram_freq) + 1))
    }
  }
  # Calculate perplexity
  perplexity <- exp(-log_prob_sum / N)
  return(perplexity)
}
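For reference, the quantity computed above is the standard perplexity PP = exp(-(1/N) * sum_i log p(g_i)), where g_1, ..., g_N are the n-grams of the test sentence, p(g_i) is the n-gram's relative frequency in the training counts, and unseen n-grams receive the crude floor probability 1/(total n-gram count + 1); lower values indicate a better fit.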
# Example test sentence
test_sentence <- "this is an example sentence"
# "employees from submitting fraudulent invoices or to block employees from accessing
# inappropriate websites were not in place"
# Calculate perplexity for unigrams, bigrams, and trigrams
perplexity1 <- calculate_perplexity(test_sentence, ng1_freq, 1)
perplexity2 <- calculate_perplexity(test_sentence, ng2_freq, 2)
perplexity3 <- calculate_perplexity(test_sentence, ng3_freq, 3)
# Display perplexity
cat(perplexity1,perplexity2,perplexity3)
## 57208.44 352465 320093
#tests
tests = c('a case of', 'would mean the', 'make me the',
'struggling but the', 'date at the', 'be on my', 'in quite some',
'with his little', 'faith during the',
'you must be')
for (i in seq_along(tests)) {
  print(predict_next_word(ng4_freq, tests[i]))
}
## length of words: 3
## 3 1 3Trying: a case of
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: case of
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: of
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "austin"
## length of words: 3
## 3 1 3Trying: would mean the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: mean the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "lmao"
## length of words: 3
## 3 1 3Trying: make me the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: me the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "“scripture"
## length of words: 3
## 3 1 3Trying: struggling but the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: but the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] ""
## length of words: 3
## 3 1 3Trying: date at the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: at the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] ""
## length of words: 3
## 3 1 3Trying: be on my
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: on my
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: my
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "stir"
## length of words: 3
## 3 1 3Trying: in quite some
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: quite some
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: some
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "🀦🀦"
## length of words: 3
## 3 1 3Trying: with his little
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: his little
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: little
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "england’s"
## length of words: 3
## 3 1 3Trying: faith during the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: during the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: the
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "beal"
## length of words: 3
## 3 1 3Trying: you must be
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 2 3Trying: must be
## Candidates: NA NA NA NA NA NA NA NA NA NA
## 3 3 3Trying: be
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "write"