# PoS regex pattern matching for ML-based tagging
using CSV
using DataFrames
global myfile =string(ARGS[1]) Input PoS files for feature calculation e.g. #filename= "/home/rajeev/myProjects/nounlist.txt"
mydf = CSV.read(myfile,header = false)
# print header for output file
println("Word,Length,Vowel Whole,Vowel Last2,Consonant Whole,Consonant Last2,Diagraph Whole, Diagraph Last2, Noun pattern, Verb pattern, Adverb pattern, Adjective pattern,Class")
for myrow in eachrow(mydf)
#count no of myVowels
mysentence= string(myrow[end])
if length(mysentence) > 2 && isascii(mysentence)
wordending= mysentence[end-1:end]
#Regexes start for various patterns linked to character patterns
myVowelsRegex = r"a|e|i|o|u"
mydigraphsRegex = r"ch|ci|ck|gh|ng|ph|qu|rh|sc|sh|th|ti|wh|wr|zh"
nounRegexSuffixes = r"ion|sion|tion|acy|ance|ence|hood|ar|or|ism|ist|ment|ness|u|ity"
verbRegexSuffixes = r"ify|ate|ize|en"
adjectiveRegexSuffixes = r"al|ful|ly|ic|ish|like|our|y|ate|able|ible"
adverbRegexSuffixes = r"ly"
global nounlast = length(collect(eachmatch(nounRegexSuffixes, mysentence[end-2:end])))
global verblast = length(collect(eachmatch(verbRegexSuffixes, mysentence[end-2:end])))
global adjectivelast = length(collect(eachmatch(adjectiveRegexSuffixes, mysentence[end-2:end])))
global adverblast = length(collect(eachmatch(adverbRegexSuffixes, mysentence[end-2:end])))
global vowelsWhole= length(collect(eachmatch(myVowelsRegex, mysentence)))
global vowelsLast2= length(collect(eachmatch(myVowelsRegex, wordending)))
global consonantsWhole= length(mysentence)-vowelsWhole
global consonantsLast2= 2-vowelsLast2
global digraphWhole = length(collect(eachmatch(mydigraphsRegex, mysentence)))
global digraphLast2 = length(collect(eachmatch(mydigraphsRegex, wordending)))
#println(mysentence,",",adjectivelast,",",mysentence[end-2:end])
println(mysentence,",",length(mysentence),",",vowelsWhole,",",vowelsLast2,",",consonantsWhole,",",consonantsLast2,",",digraphWhole,",",digraphLast2,",",nounlast,",",verblast,",",adverblast,",",adjectivelast,",","$myfile")
end
end