# PoS regex pattern matching for ML-based tagging

using CSV

using DataFrames

global myfile =string(ARGS[1]) Input PoS files for feature calculation e.g. #filename= "/home/rajeev/myProjects/nounlist.txt"

mydf = CSV.read(myfile,header = false)

# print header for output file

println("Word,Length,Vowel Whole,Vowel Last2,Consonant Whole,Consonant Last2,Diagraph Whole, Diagraph Last2, Noun pattern, Verb pattern, Adverb pattern, Adjective pattern,Class")

for myrow in eachrow(mydf)

   #count no of myVowels

   mysentence= string(myrow[end])

   if length(mysentence) > 2 && isascii(mysentence)


       wordending= mysentence[end-1:end]

#Regexes start for various patterns linked to character patterns

       myVowelsRegex = r"a|e|i|o|u"

       mydigraphsRegex = r"ch|ci|ck|gh|ng|ph|qu|rh|sc|sh|th|ti|wh|wr|zh"

       nounRegexSuffixes = r"ion|sion|tion|acy|ance|ence|hood|ar|or|ism|ist|ment|ness|u|ity"

       verbRegexSuffixes = r"ify|ate|ize|en"

       adjectiveRegexSuffixes = r"al|ful|ly|ic|ish|like|our|y|ate|able|ible"

       adverbRegexSuffixes = r"ly"


       global nounlast = length(collect(eachmatch(nounRegexSuffixes, mysentence[end-2:end])))

       global verblast = length(collect(eachmatch(verbRegexSuffixes, mysentence[end-2:end])))

       global adjectivelast = length(collect(eachmatch(adjectiveRegexSuffixes, mysentence[end-2:end])))

       global adverblast = length(collect(eachmatch(adverbRegexSuffixes, mysentence[end-2:end])))

       global vowelsWhole= length(collect(eachmatch(myVowelsRegex, mysentence)))

       global vowelsLast2= length(collect(eachmatch(myVowelsRegex, wordending)))

       global consonantsWhole= length(mysentence)-vowelsWhole

       global consonantsLast2= 2-vowelsLast2

       global digraphWhole = length(collect(eachmatch(mydigraphsRegex, mysentence)))

       global digraphLast2 = length(collect(eachmatch(mydigraphsRegex, wordending)))


       #println(mysentence,",",adjectivelast,",",mysentence[end-2:end])

       println(mysentence,",",length(mysentence),",",vowelsWhole,",",vowelsLast2,",",consonantsWhole,",",consonantsLast2,",",digraphWhole,",",digraphLast2,",",nounlast,",",verblast,",",adverblast,",",adjectivelast,",","$myfile")

   end

end

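# --- Web-page footer captured together with the listing; not part of the script ---
# To view or add a comment, sign in
# More articles by Rajeev Gangal
# Community insights
# Other members also viewed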