Executive summary

This report performs an exploratory analysis of the text data to understand how to build an n-gram based next-word predictor.

In this report I do the following: 1. download the data and calculate basic statistics such as file sizes and line counts; 2. create a 1% sample of the data for testing purposes (the final predictor can be built from a larger sample); 3. calculate n-gram frequencies for N = 1, 2, 3, 4 and plot the most frequent n-grams; 4. build a basic predictor model which tries to find the most frequent stored n-gram that matches the searched phrase.

Task 1: Getting and cleaning the data

Download the dataset.

#url = 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
#download.file(url,"Coursera-SwiftKey.zip")
#unzip("Coursera-SwiftKey.zip")

File characteristics

setwd("C:/www/coursera/data-science/datasciencecoursera/course10")
text_dir = 'final/en_US'
text_files <- list.files(text_dir, pattern = "\\.txt$", full.names = TRUE)

# Create an empty data frame to collect per-file statistics
df <- data.frame(
  File = character(), 
  Size = numeric(), 
  LineCount = integer(),
  LongestLength = integer(),
  stringsAsFactors = FALSE
)

for (i in seq_along(text_files)) {
    file_size <- file.info(text_files[i])$size
    lines <- readLines(text_files[i])
    num_lines <- length(lines)
    longest_line_length <- max(nchar(lines))
    
    df <- rbind(df, data.frame(File = text_files[i], 
                               Size = file_size,
                               LineCount = num_lines,
                               LongestLength=longest_line_length))
}

print(df)
##                            File      Size LineCount LongestLength
## 1   final/en_US/en_US.blogs.txt 210160014    899288         40833
## 2    final/en_US/en_US.news.txt 205811887     77259          5760
## 3 final/en_US/en_US.twitter.txt 167105338   2360148           144
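
readLines() loads each file (roughly 170-210 MB) fully into memory. The same statistics can also be gathered by streaming each file in fixed-size chunks; a minimal sketch using only base R (the chunk size is an arbitrary choice):

# Stream a file in chunks to count lines and find the longest line
# without holding the whole file in memory.
file_stats <- function(path, chunk_size = 100000) {
  con <- file(path, open = "r")
  on.exit(close(con))
  n_lines <- 0
  longest <- 0
  repeat {
    chunk <- readLines(con, n = chunk_size, skipNul = TRUE)
    if (length(chunk) == 0) break
    n_lines <- n_lines + length(chunk)
    longest <- max(longest, nchar(chunk))
  }
  c(LineCount = n_lines, LongestLength = longest)
}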

Find strings

# Count the lines in a file that contain search_string (case-insensitive);
# optionally print the matching lines
count_words <- function(filepath, search_string, print=FALSE) {
  lines <- readLines(filepath)
  count_lines <- 0
  
  for (line in lines) {
    if (grepl(search_string, line, ignore.case = TRUE)) {
      count_lines <- count_lines + 1
      if(print){
          print(line)
      }
    }
  }
  
  return (count_lines)
}

counts_love <- count_words(text_files[3],"love")
counts_hate <- count_words(text_files[3], "hate")
counts_love/counts_hate
## [1] 4.669125
counts_biostats <- count_words(text_files[3],"biostats",TRUE)
## [1] "i know how you feel.. i have biostats on tuesday and i have yet to study =/"
count_long <- count_words(text_files[3],"A computer once beat me at chess, but it was no match for me at kickboxing", TRUE)
## [1] "A computer once beat me at chess, but it was no match for me at kickboxing"
## [1] "A computer once beat me at chess, but it was no match for me at kickboxing"
## [1] "A computer once beat me at chess, but it was no match for me at kickboxing"

Get a random sample of sampling_factor% of the lines from each file to speed things up.

## Sampled lines have been saved to: sample/en_US
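
The sampling chunk itself is not echoed above. A minimal sketch of how such a sample can be drawn (the seed, the use of rbinom() and the exact sampling_factor value are illustrative assumptions) is:

# Sketch of the sampling step: keep roughly sampling_factor of the lines
# of each en_US file and write them to sample/en_US.
set.seed(1234)            # assumed seed, only for reproducibility
sampling_factor <- 0.01   # the 1% sample mentioned in the executive summary
out_dir <- "sample/en_US"
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

for (f in text_files) {
  lines <- readLines(f, skipNul = TRUE)
  keep  <- rbinom(length(lines), size = 1, prob = sampling_factor) == 1
  writeLines(lines[keep], file.path(out_dir, basename(f)))
}
cat("Sampled lines have been saved to:", out_dir, "\n")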

Task 2: Exploratory Data Analysis

1. What are the distributions of word frequencies?

library(tm)        # corpus handling and document-term matrices
library(RWeka)     # NGramTokenizer
library(ggplot2)   # frequency plots
library(gridExtra) # arranging plots in a grid
library(stringr)   # string splitting, used again in the modelling section

NgramTokenizer <- function(x, n) { NGramTokenizer(x, Weka_control(min = n, max = n)) }
create_dtm <- function(corpus, n) {
  DocumentTermMatrix(corpus, control = list(tokenize = function(x) NgramTokenizer(x, n)))
}
create_freq <- function(dtm_ngram){
  ngram_freq <- colSums(as.matrix(dtm_ngram))
  ngram_freq <- sort(ngram_freq, decreasing = TRUE)
  return (ngram_freq)
}

# To run on the full dataset, change text_dir to 'final/en_US/' (after setwd() to the project directory)
text_dir = 'sample/en_US/'
corpus <- VCorpus(DirSource(text_dir))

# Preprocess the text
corpus <- tm_map(corpus, content_transformer(tolower))
#corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
#corpus <- tm_map(corpus, removeWords, stopwords("en"))
#corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

# Create a Document-Term Matrix (DTM)
dtm_ng1 <- create_dtm(corpus, 1)

# Find word frequencies
ng1_freq <- create_freq(dtm_ng1)

# Display the top 10 most frequent terms
head(ng1_freq, 10)
##    the    and    you    for   that   with   this    was   have    are 
## 294285 158726  91568  77629  76104  47798  42683  40906  39814  36451

2. What are the frequencies of 2-grams and 3-grams in the dataset?

Let’s create n-grams for N = {1, 2, 3, 4} (the unigram DTM was already built above).

#dtm_ng1 <- create_dtm(corpus, 1)
dtm_ng2 <- create_dtm(corpus, 2)
dtm_ng3 <- create_dtm(corpus, 3)
dtm_ng4 <- create_dtm(corpus, 4)

# Find ngram frequencies
#ng1_freq <- create_freq(dtm_ng1)
ng2_freq <- create_freq(dtm_ng2)
ng3_freq <- create_freq(dtm_ng3)
ng4_freq <- create_freq(dtm_ng4)


# Display the top 10 most frequent bigrams
cat("Top 10 most frequent bigrams:\n")
## Top 10 most frequent bigrams:
print(head(ng2_freq, 10))
##       right now       cant wait       dont know      last night       feel like 
##             215             167             151             139             115 
##         can get        im going looking forward  happy birthday        let know 
##             113             110             105             100              93
cat("Top 10 most frequent ngrams N=2:\n")
## Top 10 most frequent ngrams N=2:
print(head(ng4_freq, 10))
##                     gas gas gas gas             matt hunter matt hunter 
##                                   9                                   9 
##       north dakota snowmobile trail           hate andover hate andover 
##                                   9                                   8 
##             hunter matt hunter matt           andover hate andover hate 
##                                   8                                   7 
##              happy mothers day moms         justin justin justin justin 
##                                   6                                   6 
## university chicago chicago illinois               get real rewards just 
##                                   5                                   4
cat("Top 10 most frequent ngrams N=3:\n")
## Top 10 most frequent ngrams N=3:
print(head(ng3_freq, 10))
##     cant wait see happy mothers day       let us know    happy new year 
##                40                37                20                19 
##    im pretty sure      feel like im     new york city    new york times 
##                15                13                11                11 
##    cant wait hear       gas gas gas 
##                10                10

Plotting n-gram frequencies

# Bar chart of the 20 most frequent n-grams for a given N
createPlot <- function(n, ngram_freq){
  ngram_freq_df <- data.frame(ngram = names(ngram_freq[1:20]), frequency = ngram_freq[1:20])
  myplot <- ggplot(ngram_freq_df, aes(x = reorder(ngram, -frequency), y = frequency)) + 
  geom_bar(stat = "identity", fill = "green", color = "black") +
  labs(title = paste0("Top 20 n-gram frequencies, N=", n), x = "n-grams", y = "Frequency") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
  return (myplot)
}

# Build a top-20 frequency bar chart for each N
plot1 <- createPlot(1, ng1_freq)
plot2 <- createPlot(2, ng2_freq)
plot3 <- createPlot(3, ng3_freq)
plot4 <- createPlot(4, ng4_freq)

# Arrange plots in a 2x2 grid
grid.arrange(plot1, plot2, plot3, plot4, nrow = 2, ncol = 2)

How many unique words do you need to cover 50% of all word instances in the language?

We already have the sorted word-frequency table. We convert it to a data frame and calculate the total number of word instances from which the frequencies were computed; 50% and 90% of that total are the coverage targets.

freq_data <- data.frame(ng1_freq)
total_word_count <- sum(freq_data)
half_word_count <- total_word_count*0.5
ninety_percent <- total_word_count*0.9

findWordCount <- function(coverage_count){
    # Initialize cumulative sum and counter
    cumulative_sum <- 0
    counter <- 0
      
    # Iterate through the sorted term frequencies until the coverage target is reached
    for (freq in ng1_freq) {
      cumulative_sum <- cumulative_sum + freq
      counter <- counter + 1
      if (cumulative_sum >= coverage_count) {
        break
      }
    }
    return (c(counter, cumulative_sum))
}

# Print the result
data50 <- findWordCount(half_word_count)
data90 <- findWordCount(ninety_percent)
cat("Number of unique words needed for 50% coverage:", data50[1], "for 90%", data90[1],"\n")
## Number of unique words needed for 50% coverage: 824 for 90% 14543
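
Since ng1_freq is already sorted in decreasing order, the same coverage counts can be computed without a loop:

# Vectorized alternative: cumulative coverage over the sorted unigram frequencies
cum_cov <- cumsum(ng1_freq) / sum(ng1_freq)
words_for_50 <- which(cum_cov >= 0.5)[1]
words_for_90 <- which(cum_cov >= 0.9)[1]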

How do you evaluate how many of the words come from foreign languages?

First we get the vector of unique terms and check them against an English dictionary with hunspell; terms that fail the spell check are treated as likely foreign (or slang and missing-apostrophe forms, as the examples below show).

# Get the terms (words)
terms <- names(ng1_freq)

# Display the first few terms
head(terms)
## [1] "just" "like" "will" "one"  "can"  "get"
library(hunspell)

# Limiting to the 1,000 most frequent terms for the purpose of demonstration
checked_terms <- terms[1:1000]
is_english <- sapply(checked_terms, function(word) {
  hunspell_check(word, dict = "en_US")
})

# Words that fail the English spell check are flagged as likely foreign
foreign_words <- checked_terms[!is_english]
head(foreign_words)
## [1] "dont"  "lol"   "thats" "youre" "ive"   "didnt"
cat("Number of foreign words:", length(foreign_words),"\n")
## Number of foreign words: 2508

Can you think of a way to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?

Yes: store the words as stems. For example, “goes” and “going” can both be covered by the stem “go”. Regular expressions can also be used to match different variants of a word, as in the example below; a stemming sketch follows it.

pattern <- "\\bgo(es|ing)?\\b"
text <- c("She goes to school", "They are going to the park", "I will go home", "Gone with the wind")
matches <- grep(pattern, text, value = TRUE)
print(matches)
## [1] "She goes to school"         "They are going to the park"
## [3] "I will go home"

Task 3: Modelling

Tokenize the corpus text. In simple terms, tokenization just splits the entire text into words or word-like pieces of text.

# Convert the corpus to a character vector 
corpus_text <- sapply(corpus, as.character)
corpus_text <- unlist(corpus_text)

# Tokenize the text
tokens <- unlist(str_split(corpus_text, " "))

Predictions

Predictions are of low quality here because the data cleaning is not very effective and the sample size is too small; many searches do not match any stored n-gram. But this framework will allow me to build the final product in the coming weeks.

# Return the last n elements of vector v (or the whole vector if n >= length(v))
getLastWords <- function(v,n){
  vlen <- length(v)
  i1 <- vlen-n+1
  i2 <- vlen
  cat(vlen, i1, i2)   # debug output: vector length and slice bounds
 if(n>=vlen){
   return(v)
 } else {
   return(v[i1:i2])
 }
}

# Back-off prediction: try to match the longest suffix of previous_words
# against the stored n-grams, fall back to shorter suffixes, and finally
# return a random token if nothing matches.
predict_next_word <- function(ngram_freq, previous_words) {
  prev_arr <- str_split(previous_words," ")[[1]]
  arr_len <- length(prev_arr)
  cat("length of words:",arr_len,"\n")
  for(n in arr_len:1)
  {
    curr_arr <- paste(getLastWords(prev_arr,n),collapse=" ")
    cat("Trying:",curr_arr,"\n")
    # candidate n-grams that start with the current context
    candidates <- grep(paste0("^", curr_arr," "), 
                       names(ngram_freq), value = TRUE)
    cat("Candidates:",head(candidates, 10),"\n")
    if (length(candidates) > 0) {
      # take the most frequent candidate and return the word after the context
      next_word <- str_split(candidates[which.max(ngram_freq[candidates])], " ")[[1]]
      return(next_word[n + 1])
    } else {
      if(n==1){
        return(sample(tokens, 1))
      }
    }
  }
}

# Combine the bigram, trigram and 4-gram frequency tables into one lookup vector
ng_arr <- c(ng2_freq, ng3_freq, ng4_freq)
# Examples of predictions
predict_next_word(ng_arr, "fake fake fake")
## length of words: 3 
## 3 1 3Trying: fake fake fake 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: fake fake 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: fake 
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "trader"
predict_next_word(ng_arr, "new york")
## length of words: 2 
## 2 1 2Trying: new york 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 2 2 2Trying: york 
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "check"
predict_next_word(ng_arr, "how are")
## length of words: 2 
## 2 1 2Trying: how are 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 2 2 2Trying: are 
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "ifwomendidnotexist"
predict_next_word(ng_arr, "cool")
## length of words: 1 
## 1 1 1Trying: cool 
## Candidates: NA NA NA NA NA NA NA NA NA NA
## [1] "action"

Calculating perplexity
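
For a test sentence with N scored n-grams g_1, …, g_N, perplexity is exp(-(1/N) * Σ log p(g_i)); lower values mean the model is less surprised by the text. The function below estimates p(g_i) simply as the n-gram's relative frequency in the sample and falls back to a small constant probability for unseen n-grams.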

# Function to calculate perplexity for any n-gram frequency parameter
calculate_perplexity <- function(test_sentence, ngram_freq, n) {
  # Split the test sentence into words
  test_ngrams <- unlist(strsplit(test_sentence, " "))
  
  # Calculate the number of n-grams in the test sentence
  N <- length(test_ngrams) - (n - 1)
  
  # Initialize the log probability sum
  log_prob_sum <- 0
  
  # Loop through the test n-grams
  for (i in 1:N) {
    ngram <- paste(test_ngrams[i:(i + n - 1)], collapse = " ")
    if (ngram %in% names(ngram_freq)) {
      log_prob_sum <- log_prob_sum + log(ngram_freq[ngram] / sum(ngram_freq))
    } else {
      # Add a small probability for unseen n-grams (smoothing)
      log_prob_sum <- log_prob_sum + log(1 / (sum(ngram_freq) + 1))
    }
  }
  
  # Calculate perplexity
  perplexity <- exp(-log_prob_sum / N)
  return(perplexity)
}

# Example test sentence
test_sentence <- "this is an example sentence"

# "employees from submitting fraudulent invoices or to block employees from accessing 
# inappropriate websites were not in place"

# Calculate perplexity for unigrams, bigrams and trigrams
perplexity1 <- calculate_perplexity(test_sentence, ng1_freq, 1)
perplexity2 <- calculate_perplexity(test_sentence, ng2_freq, 2)
perplexity3 <- calculate_perplexity(test_sentence, ng3_freq, 3)

# Display perplexity
cat(perplexity1,perplexity2,perplexity3)
## 57208.44 352465 320093
# Additional test phrases for the predictor
tests = c('a case of', 'would mean the', 'make me the', 
          'struggling but the', 'date at the', 'be on my', 'in quite some',
          'with his little', 'faith during the',
          'you must be')

for(i in 1:length(tests)) {
 print(predict_next_word(ng4_freq, tests[i]))
}
## length of words: 3 
## 3 1 3Trying: a case of 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: case of 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: of 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "austin"
## length of words: 3 
## 3 1 3Trying: would mean the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: mean the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "lmao"
## length of words: 3 
## 3 1 3Trying: make me the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: me the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "“scripture"
## length of words: 3 
## 3 1 3Trying: struggling but the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: but the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] ""
## length of words: 3 
## 3 1 3Trying: date at the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: at the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] ""
## length of words: 3 
## 3 1 3Trying: be on my 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: on my 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: my 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "stir"
## length of words: 3 
## 3 1 3Trying: in quite some 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: quite some 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: some 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "🀦🀦"
## length of words: 3 
## 3 1 3Trying: with his little 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: his little 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: little 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "england’s"
## length of words: 3 
## 3 1 3Trying: faith during the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: during the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: the 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "beal"
## length of words: 3 
## 3 1 3Trying: you must be 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 2 3Trying: must be 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## 3 3 3Trying: be 
## Candidates: NA NA NA NA NA NA NA NA NA NA 
## [1] "write"