This is the algorithm created for my Capstone Project for the Data Science Specialization by Johns Hopkins University on Coursera.
The goal of the Capstone Project was to implement a model that predicts the next word given one or more words, and to develop a web app that uses it.
Below you can find how I created the tables used for the app.
In the next post there will be the code for the app.
------------------------------------------------------------------------
# 1. IMPORT DATA
# Read the three English corpora (twitter, news, blogs) into character
# vectors (one element per line) and concatenate them.
# NOTE(review): paths are machine-specific; file.path() + a relative base
# directory would make this portable.
# twitter
twitter <- readLines("C:/Documents/Studies/Capstone Project/Coursera-SwiftKey/final/en_US/en_US.twitter.txt", encoding="UTF-8")
twitter <- iconv(twitter, from = "latin1", to = "UTF-8", sub="")
# news: opened in binary mode ("rb") so readLines() is not cut short by the
# stray control characters present in this file
conn <- file("C:/Documents/Studies/Capstone Project/Coursera-SwiftKey/final/en_US/en_US.news.txt", open="rb")
news <- readLines(conn, encoding="UTF-8")
close(conn)
rm(conn)
# fix: news was the only corpus not normalized with iconv(); apply the same
# conversion as twitter/blogs so invalid bytes are stripped consistently
news <- iconv(news, from = "latin1", to = "UTF-8", sub="")
# blogs
blogs <- readLines("C:/Documents/Studies/Capstone Project/Coursera-SwiftKey/final/en_US/en_US.blogs.txt", encoding="UTF-8")
blogs <- iconv(blogs, from = "latin1", to = "UTF-8", sub="")
# one flat character vector holding every line of raw text
DataComplete <- c(twitter, blogs, news)
# 2. CLEAN DATASET
# Turn the raw lines into a flat vector of lowercase word tokens, with the
# sentinel "|" appended to every sentence so that later n-grams crossing a
# sentence boundary can be detected and dropped.
#
# Replace noise characters with a space in a single pass.
# fix: the original ran the comma substitution three times (copy-paste
# duplication); one character-class gsub() is equivalent and does it once.
DataClean <- gsub('[#"_,-]', " ", DataComplete)
# NOTE(review): apostrophes are deliberately kept inside words ("don't");
# the original had a commented-out removal it never decided on.
# Split each line at sentence-ending / bracketing punctuation.
# One character class is equivalent to the original chain of eight
# fixed-string strsplit() calls (each delimiter is a single character).
DataClean <- unlist(strsplit(DataClean, "[.!?:;()/]"))
# append the sentence-boundary marker " |" to every sentence
DataClean <- paste(DataClean, "|")
# lowercase everything
DataClean <- tolower(DataClean)
# trim whitespace before and after each sentence
DataClean <- trimws(DataClean)
# collapse runs of whitespace into a single space
DataClean <- gsub("\\s+", " ", DataClean)
# split into individual word tokens
DataClean <- unlist(strsplit(DataClean, "\\s"))
# 3. BIGRAM TABLE
# Build a lookup table: for every starting word, the three most frequent
# following words (columns First/Second/Third plus their frequencies).
library(quanteda)
# Two consecutive words.
# NOTE(review): tokens_ngrams() is documented to take a quanteda tokens
# object; here it receives a bare character vector — confirm this call
# against the quanteda version actually used.
Data2Gram <- tokens_ngrams(DataClean, concatenator=" ")
# Drop bigrams containing the sentence-boundary marker "|" so pairs never
# span two sentences
Data2Gram <- Data2Gram[!grepl("\\|", Data2Gram)]
# convert to a one-column data.frame (one bigram string per row)
DF2gram <- data.frame(unlist(Data2Gram))
# count occurrences of each distinct bigram and sort, most frequent first
Freq2Gram <- as.data.frame(table(DF2gram))
Freq2Gram <- Freq2Gram[order(-Freq2Gram$Freq),]
# split each bigram into word columns: X1 = starting word, X2 = follower
Freq2Table <- data.frame(do.call('rbind', strsplit(as.character(Freq2Gram$DF2gram),' ',fixed=TRUE)),Freq2Gram$Freq)
# top follower per starting word: after sorting by frequency, !duplicated()
# keeps only the first (i.e. most frequent) row for each X1
Total2Gram <- Freq2Table[order(Freq2Table$Freq2Gram.Freq,decreasing=T),]
Top2Gram <- Total2Gram[!duplicated(Total2Gram$X1),]
# second-best follower: remove the top rows with anti_join, then repeat
library(dplyr)
Rest2Gram <- anti_join(Total2Gram, Top2Gram, by = c("X1" = "X1", "X2" = "X2"))
Rest2Gram <- Rest2Gram[order(Rest2Gram$Freq2Gram.Freq,decreasing=T),]
Second2Gram <- Rest2Gram[!duplicated(Rest2Gram$X1),]
# third-best follower: same trick once more
Other2Gram <- anti_join(Rest2Gram, Second2Gram, by = c("X1" = "X1", "X2" = "X2"))
Other2Gram <- Other2Gram[order(Other2Gram$Freq2Gram.Freq,decreasing=T),]
Third2Gram <- Other2Gram[!duplicated(Other2Gram$X1),]
# assemble the final table: one row per starting word, top-3 completions
WordBigram <- left_join(Top2Gram, Second2Gram, by=c("X1"="X1"))
WordBigram <- left_join(WordBigram, Third2Gram, by=c("X1"="X1"))
colnames(WordBigram) <- c("Start","First","Freq1","Second","Freq2","Third","Freq3")
# drop rows whose starting word contains "<" or ">"
WordBigram <- WordBigram[!grepl("<", WordBigram$Start),]
WordBigram <- WordBigram[!grepl(">", WordBigram$Start),]
# blank out completion cells containing "<" or ">"
# NOTE(review): sub() coerces its replacement to character — verify the
# affected cells become NA rather than the literal string "NA"
WordBigram <- as.data.frame(sapply(WordBigram,sub,pattern='<',replacement=NA))
WordBigram <- as.data.frame(sapply(WordBigram,sub,pattern='>',replacement=NA))
# example lookup: top-3 predictions following "word"
WordBigram %>% filter(Start == "word") %>% select(First,Second,Third)
# 4. TRIGRAM TABLE
# Same procedure as the bigram table, keyed on two-word prefixes.
# Not enough memory for trigrams with the complete data on my notebook,
# so a 30% sample of the tokens is taken.
set.seed(190587)
# NOTE(review): sample() expects an integer size; length(DataClean)*0.3 can
# be fractional — consider wrapping it in floor(). Also note this
# reassigns DataClean, so later sections see the reduced vector.
DataClean <- sample(DataClean, length(DataClean)*0.3)
# Three consecutive words
Data3Gram <- tokens_ngrams(DataClean, n = 3L, concatenator=" ")
# Drop trigrams containing the sentence-boundary marker "|"
Data3Gram <- Data3Gram[!grepl("\\|", Data3Gram)]
# convert to a one-column data.frame (one trigram string per row)
DF3gram <- data.frame(unlist(Data3Gram))
# count occurrences of each distinct trigram and sort, most frequent first
Freq3Gram <- as.data.frame(table(DF3gram))
Freq3Gram <- Freq3Gram[order(-Freq3Gram$Freq),]
# split each trigram into three word columns (X1 X2 = prefix, X3 = follower)
Freq3Table <- data.frame(do.call('rbind', strsplit(as.character(Freq3Gram$DF3gram),' ',fixed=TRUE)),Freq3Gram$Freq)
# keep only trigrams seen more than once (memory/size trade-off)
Freq3Table <- filter(Freq3Table, Freq3Gram.Freq>1)
# key X = the two-word prefix; X3 = the predicted third word
Freq3Table <- mutate(Freq3Table,X=paste(X1,X2))
Freq3Table <- select(Freq3Table,X,X3,Freq3Gram.Freq)
# top completion per prefix (first row per X after sorting by frequency)
Total3Gram <- Freq3Table[order(Freq3Table$Freq3Gram.Freq,decreasing=T),]
Top3Gram <- Total3Gram[!duplicated(Total3Gram$X),]
# second-best completion: remove the top rows, repeat
Rest3Gram <- anti_join(Total3Gram, Top3Gram, by = c("X" = "X", "X3" = "X3"))
Rest3Gram <- Rest3Gram[order(Rest3Gram$Freq3Gram.Freq,decreasing=T),]
Second3Gram <- Rest3Gram[!duplicated(Rest3Gram$X),]
# third-best completion
Other3Gram <- anti_join(Rest3Gram, Second3Gram, by = c("X" = "X", "X3" = "X3"))
Other3Gram <- Other3Gram[order(Other3Gram$Freq3Gram.Freq,decreasing=T),]
Third3Gram <- Other3Gram[!duplicated(Other3Gram$X),]
# assemble the final table: one row per two-word prefix, top-3 completions
WordTrigram <- left_join(Top3Gram, Second3Gram, by=c("X"="X"))
WordTrigram <- left_join(WordTrigram, Third3Gram, by=c("X"="X"))
colnames(WordTrigram) <- c("Start","First","Freq1","Second","Freq2","Third","Freq3")
# drop rows whose starting prefix contains "<" or ">"
WordTrigram <- WordTrigram[!grepl("<", WordTrigram$Start),]
WordTrigram <- WordTrigram[!grepl(">", WordTrigram$Start),]
# blank out completion cells containing "<" or ">"
# NOTE(review): same sub(replacement=NA) coercion caveat as the bigram table
WordTrigram <- as.data.frame(sapply(WordTrigram,sub,pattern='<',replacement=NA))
WordTrigram <- as.data.frame(sapply(WordTrigram,sub,pattern='>',replacement=NA))
# example lookup: top-3 predictions following "as soon"
WordTrigram %>% filter(Start == "as soon") %>% select(First,Second,Third)
# 5. QUADRIGRAM TABLE
# Same procedure again, keyed on three-word prefixes.
# Not enough memory for quadrigrams with the complete data on my notebook,
# so another sample is taken.
set.seed(190587)
# NOTE(review): DataClean was already reduced to 30% in section 4, so this
# 25% sample is only 7.5% of the original token vector — confirm that is
# intentional. Fractional-size caveat for sample() applies here too.
DataClean <- sample(DataClean, length(DataClean)*0.25)
# Four consecutive words
Data4Gram <- tokens_ngrams(DataClean, n = 4L, concatenator=" ")
# Drop quadrigrams containing the sentence-boundary marker "|"
Data4Gram <- Data4Gram[!grepl("\\|", Data4Gram)]
# convert to a one-column data.frame (one quadrigram string per row)
DF4gram <- data.frame(unlist(Data4Gram))
# count occurrences of each distinct quadrigram and sort, most frequent first
Freq4Gram <- as.data.frame(table(DF4gram))
Freq4Gram <- Freq4Gram[order(-Freq4Gram$Freq),]
# keep only quadrigrams seen more than once, then split into word columns
# (unlike section 4, the filter runs BEFORE the split — same net result)
Freq4GramFilter <- filter(Freq4Gram, Freq>1)
Freq4Table <- data.frame(do.call('rbind', strsplit(as.character(Freq4GramFilter$DF4gram),' ',fixed=TRUE)),Freq4GramFilter$Freq)
# key X = the three-word prefix; X4 = the predicted fourth word
Freq4Table <- mutate(Freq4Table,X=paste(X1,X2,X3))
Freq4Table <- select(Freq4Table,X,X4,Freq4GramFilter.Freq)
# top completion per prefix (first row per X after sorting by frequency)
Total4Gram <- Freq4Table[order(Freq4Table$Freq4GramFilter.Freq,decreasing=T),]
Top4Gram <- Total4Gram[!duplicated(Total4Gram$X),]
# second-best completion: remove the top rows, repeat
Rest4Gram <- anti_join(Total4Gram, Top4Gram, by = c("X" = "X", "X4" = "X4"))
Rest4Gram <- Rest4Gram[order(Rest4Gram$Freq4GramFilter.Freq,decreasing=T),]
Second4Gram <- Rest4Gram[!duplicated(Rest4Gram$X),]
# third-best completion
Other4Gram <- anti_join(Rest4Gram, Second4Gram, by = c("X" = "X", "X4" = "X4"))
Other4Gram <- Other4Gram[order(Other4Gram$Freq4GramFilter.Freq,decreasing=T),]
Third4Gram <- Other4Gram[!duplicated(Other4Gram$X),]
# assemble the final table: one row per three-word prefix, top-3 completions
WordQuadrigram <- left_join(Top4Gram, Second4Gram, by=c("X"="X"))
WordQuadrigram <- left_join(WordQuadrigram, Third4Gram, by=c("X"="X"))
colnames(WordQuadrigram) <- c("Start","First","Freq1","Second","Freq2","Third","Freq3")
# drop rows whose starting prefix contains "<" or ">"
WordQuadrigram <- WordQuadrigram[!grepl("<", WordQuadrigram$Start),]
WordQuadrigram <- WordQuadrigram[!grepl(">", WordQuadrigram$Start),]
# blank out completion cells containing "<" or ">"
# NOTE(review): same sub(replacement=NA) coercion caveat as the bigram table
WordQuadrigram <- as.data.frame(sapply(WordQuadrigram,sub,pattern='<',replacement=NA))
WordQuadrigram <- as.data.frame(sapply(WordQuadrigram,sub,pattern='>',replacement=NA))
# example lookup: top-3 predictions following "as soon as"
WordQuadrigram %>% filter(Start == "as soon as") %>% select(First,Second,Third)
Categories
Bash
(3)
BOT
(2)
C#
(1)
Cluster Analysis
(1)
Data Cleaning
(6)
Data Ingestion
(2)
Data Science Specialization
(10)
Data Visualization
(15)
ggplot2
(1)
Hadoop
(1)
Hashnode
(3)
Machine Learning
(5)
MapReduce
(1)
Maps
(1)
Markdown
(7)
Market Basket Analysis
(1)
MATLAB
(1)
Matplotlib
(3)
Numpy
(2)
Octave
(1)
Pandas
(3)
Python
(17)
R
(22)
Regression
(7)
scikit-learn
(1)
Seaborn
(1)
Shell
(3)
Shiny App
(1)
SSIS
(3)
Statistical Inference
(2)
T-SQL
(8)
Unix
(3)
No comments:
Post a Comment