Kotz’i’jに恋して16

Posted on 2017-06-01 by 13 No'j

ここ数日間取り組んできたマルコフ連鎖による文章生成アルゴリズムをようやくTzijonikプログラムに取り入れた。重要な更新なのでバージョンを２にした。慣れないRubyで色々とメソッドを書いたので、attr_readerとか今まで曖昧だった部分も理解できる様になった。

コードはやはりDictionary.rbとResponder.rbへの追加修正が主。Dictionary.rbにはマルコフ連鎖関連の下記コードを追加：

  # Learns word triples from one input string and appends them to @markov.
  #
  # The input is split on whitespace and the first word is tagged with a
  # "%START%" prefix. Inputs with fewer than three words are ignored. For every
  # position a row [word, next word, word after next] is stored, except that:
  # * positions whose first word contains "." or "?" are skipped;
  # * the third element becomes "%END%" when there is no third word or the
  #   second word contains "." or "?";
  # * "%END%" is appended to the third word when that word contains "." or "?".
  def studyMarkov(input)
    tokens = input.split.each_with_index.map do |token, position|
      position.zero? ? "%START%" + token : token
    end
    return if tokens.size < 3

    closes_sentence = ->(token) { token.include?(".") || token.include?("?") }

    (0..tokens.size - 2).each do |i|
      head = tokens[i]
      middle = tokens[i + 1]
      tail = tokens[i + 2]
      next if closes_sentence.call(head)

      ending =
        if tail.nil? || closes_sentence.call(middle)
          "%END%"
        elsif closes_sentence.call(tail)
          tail + "%END%"
        else
          tail
        end
      @markov << [head, middle, ending]
    end
  end

  # Overwrites dictionaries/MarkovDic.txt with the current triple table, one
  # row per line in the form "first second third".
  #
  # +markov+ is not defined in this method; presumably it is a reader for the
  # table that studyMarkov fills (confirm against the enclosing class).
  def saveMarkov
    File.open('dictionaries/MarkovDic.txt', 'w') do |file|
      markov.each do |first, second, third|
        file.puts(first + " " + second + " " + third)
      end
    end
  end

def studyMarkov(input)

words = []

fragments = input.split

count = 0

fragments.each do |word|

if count == 0 then

word = "%START%" + word

end

words.push(word)

count += 1

end

unless words.size < 3

for i in 0..words.size - 2 do

next if words[i].include?(".") or words[i].include?("?")

if words[i+2] == nil or words[i+1].include?(".") or words[i+1].include?("?") then

@markov << [words[i], words[i+1], "%END%"]

elsif words[i+2].include?(".") or words[i+2].include?("?") then

@markov << [words[i], words[i+1], words[i+2] + "%END%"]

else

@markov << [words[i], words[i+1], words[i+2]]

end

def saveMarkov

open('dictionaries/MarkovDic.txt', 'w') do |f|

markov.each do |a, b, c|

f.puts([a + " " + b + " " + c])

end

それからResponder.rbにはマルコフ連鎖で回答を作成するクラスを追加：

# Responder that answers with a sentence generated from the triple table
# exposed by @dictionary.markov. The user's input and the mood are not used.
class MarkovResponder < Responder
  # Builds one sentence by chaining rows of the form [prefix, middle, suffix].
  #
  # A random row whose prefix contains "%START%" opens the sentence. After
  # that, a random row whose prefix equals the current suffix contributes its
  # middle and suffix words, until a suffix containing "%END%" is reached.
  # Both markers are removed from the returned text.
  #
  # @param input [Object] unused
  # @param mood [Object] unused
  # @return [String, nil] the sentence; nil when the table has no "%START%"
  #   row or when "%END%" was not reached within 100 steps. When the chain
  #   reaches a word that no row starts with, the words collected so far are
  #   returned.
  def response(input, mood)
    rows = @dictionary.markov

    # Array#sample returns nil for an empty list, so a missing opening row is
    # detected here instead of failing later on a nil lookup.
    opening = rows.select { |prefix, _middle, _suffix| prefix.include?("%START%") }.sample
    return nil if opening.nil?

    first, second, suffix = opening
    sentence = "#{first} #{second} #{suffix}"
    steps = 1 # the opening row counts as the first of at most 100 steps

    until suffix.include?("%END%")
      return nil if steps >= 100

      following = rows.select { |prefix, _middle, _suffix| prefix == suffix }.sample
      break if following.nil? # dead end: no row continues from this word

      _, middle, suffix = following
      sentence << " " << middle << " " << suffix
      steps += 1
    end

    sentence.gsub("%START%", "").gsub("%END%", "")
  end
end

class MarkovResponder < Responder

def response(input, mood)

count = 0

suffix = ""

newSentence = ""

while count < 100

if newSentence == "" then

startCandidates = []

candidatesCount = 0

@dictionary.markov.each do |a, b, c|

if a.include?("%START%") then

startCandidates << [a, b, c]

candidatesCount += 1

end

r = rand(candidatesCount)

a = startCandidates[r][0]

b = startCandidates[r][1]

c = startCandidates[r][2]

newSentence = a + " " + b + " " + c

suffix = c

count += 1

else

rowCount = 0

candidates = []

@dictionary.markov.each do |a, b, c|

if suffix == a then;

candidates << [a, b, c]

rowCount += 1

end

r = rand(rowCount)

b = candidates[r][1]

c = candidates[r][2]

newSentence += " " + b + " " + c

suffix = c

count += 1

end

if suffix.include?("%END%") then

newSentence.gsub!("%START%", "")

newSentence.gsub!("%END%", "")

return newSentence

break

end

プログラムを起動してみる。

5割の確率でマルコフ連鎖で生成された文章で返事をするようにしている。ただ、入力した文章からキーワードを拾っている訳ではないので、意味のある会話にはなりにくい。でもKotz’i’jがある程度長い返答を出来るようにはなった。

今後はキーワードを拾う仕様にすることと、文法を考慮した上でマルコフ連鎖アルゴリズムに修正を加える必要がある。ただ、そうするとマルコフ連鎖では無くなってしまうけど。

Kotz’i’jに恋して15：マルコフ連鎖(マヤ・キチェ語その２)

Posted on 2017-06-01 by 13 No'j

昨日載せたコードを少し修正してみた。辞書ファイルへの書き込みも加えた。

MarkovChain.rb

# Reads the file at +text+ and returns its words as a flat array, with the
# first word of every sentence prefixed by "%START%".
#
# Sentences are the matches of /[^.?!]*./: a run of characters other than
# ".", "?" and "!" followed by one more character, so the terminator stays
# attached to the last word. Each sentence is then split on whitespace.
#
# The file is read with File.read rather than Kernel#open so that a path
# beginning with "|" is treated as a file name and never run as a command.
#
# @param text [String] path of the text file
# @return [Array<String>] the words in reading order
def paragraphSplit(text)
  words = []
  File.read(text).scan(/[^.?!]*./).each do |sentence|
    sentence.split.each_with_index do |word, index|
      words.push(index.zero? ? "%START%" + word : word)
    end
  end
  words
end

# Generates one sentence by walking the triple table +markov+.
#
# Each row is [prefix, middle, suffix]. A random row whose prefix contains
# "%START%" opens the sentence. After that, a random row whose prefix equals
# the current suffix contributes its middle and suffix words, until a suffix
# containing "%END%" is reached. Both markers are removed from the result.
#
# @param markov [Array<Array<String>>] rows of three strings
# @return [String, nil] the sentence; nil when the table has no "%START%" row
#   or when "%END%" was not reached within 100 steps. When the chain reaches
#   a word that no row starts with, the words collected so far are returned.
def writeASentence(markov)
  # Array#sample returns nil for an empty list, so a missing opening row is
  # detected here instead of failing later on a nil lookup.
  opening = markov.select { |prefix, _middle, _suffix| prefix.include?("%START%") }.sample
  return nil if opening.nil?

  first, second, suffix = opening
  sentence = "#{first} #{second} #{suffix}"
  steps = 1 # the opening row counts as the first of at most 100 steps

  until suffix.include?("%END%")
    return nil if steps >= 100

    following = markov.select { |prefix, _middle, _suffix| prefix == suffix }.sample
    break if following.nil? # dead end: no row continues from this word

    _, middle, suffix = following
    sentence << " " << middle << " " << suffix
    steps += 1
  end

  sentence.gsub("%START%", "").gsub("%END%", "")
end

# Builds the triple table used for sentence generation from a word list.
#
# For every position a row [word, next word, word after next] is produced,
# except that:
# * positions whose first word contains "." or "?" are skipped;
# * the third element becomes "%END%" when there is no third word or the
#   second word contains "." or "?";
# * "%END%" is appended to the third word when that word contains "." or "?".
# Note that "!" is not treated as a sentence end here.
#
# @param words [Array<String>] words in reading order
# @return [Array<Array<String>>] the rows; an empty array when fewer than
#   three words are given, so callers can always iterate the result
def markovDic(words)
  markov = []
  return markov if words.size < 3

  closes_sentence = ->(word) { word.include?(".") || word.include?("?") }

  (0..words.size - 2).each do |i|
    first = words[i]
    second = words[i + 1]
    third = words[i + 2] # nil at the last position
    next if closes_sentence.call(first)

    ending =
      if third.nil? || closes_sentence.call(second)
        "%END%"
      elsif closes_sentence.call(third)
        third + "%END%"
      else
        third
      end
    markov << [first, second, ending]
  end
  markov
end

def paragraphSplit(text)

words = []

temp = ""

input = ""

open(text) do |f|

f.each do |line|

temp << line

end

input << temp

sentence = input.scan(/[^.?!]*./)

sentence.each do |s|

fragments = s.split

count = 0

fragments.each do |word|

if count == 0 then

word = "%START%" + word

end

words.push(word)

count += 1

end

return words

end

def writeASentence(markov)

count = 0

suffix = ""

newSentence = ""

while count < 100

if newSentence == "" then

startCandidates = []

candidatesCount = 0

markov.each do |a, b, c|

if a.include?("%START%") then

startCandidates << [a, b, c]

candidatesCount += 1

end

r = rand(candidatesCount)

a = startCandidates[r][0]

b = startCandidates[r][1]

c = startCandidates[r][2]

newSentence = a + " " + b + " " + c

suffix = c

count += 1

else

rowCount = 0

candidates = []

markov.each do |a, b, c|

if suffix == a then;

candidates << [a, b, c]

rowCount += 1

end

r = rand(rowCount)

b = candidates[r][1]

c = candidates[r][2]

newSentence += " " + b + " " + c

suffix = c

count += 1

end

if suffix.include?("%END%") then

newSentence.gsub!("%START%", "")

newSentence.gsub!("%END%", "")

return newSentence

break

end

def markovDic(words)

unless words.size < 3

markov = []

for i in 0..words.size - 2 do

next if words[i].include?(".") or words[i].include?("?")

if words[i+2] == nil or words[i+1].include?(".") or words[i+1].include?("?") then

markov << [words[i], words[i+1], "%END%"]

elsif words[i+2].include?(".") or words[i+2].include?("?") then

markov << [words[i], words[i+1], words[i+2] + "%END%"]

else

markov << [words[i], words[i+1], words[i+2]]

end

return markov

end

MarkovChainTestKiche.rb

require '.\MarkovChain'

# Build the word list and the triple table from the sample corpus.
words = paragraphSplit('./MINEDUC.txt')
markov = markovDic(words)

# Print ten generated sentences, each under a numbered heading and followed
# by a blank line.
1.upto(10) do |count|
  sentence = writeASentence(markov)
  puts "Sentence #{count}: "
  puts sentence
  puts
end

# Save the table, one space-separated triple per line.
File.open('./MarkovDic.txt', 'w') do |f|
  markov.each { |a, b, c| f.puts(a + " " + b + " " + c) }
end

require '.\MarkovChain'

words = []

markov = []

words = paragraphSplit('./MINEDUC.txt')

markov = markovDic(words)

count = 1

while count <=10 do

sentence = writeASentence(markov)

puts ("Sentence " + count.to_s + ": ")

puts sentence

puts""

count += 1

end

open('./MarkovDic.txt', 'w') do |f|

markov.each do |a, b, c|

f.puts([a + " " + b + " " + c])

end

辞書ファイルの読み込みはこの様に：

# Rebuild the triple table from the saved dictionary: each line is split on
# whitespace and its first three fields become one row (missing fields are
# nil).
markov = File.foreach('./MarkovDic.txt').map { |line| line.split.values_at(0, 1, 2) }

markov = []

open('./MarkovDic.txt') do |f|

f.each do |line|

a, b, c = line.split

markov << [a, b, c]

end

単純だけど必要なことは全てこれらのコードで出来ている。しかし、コードを見ると統計分析を行っているような書き方になっている。癖が抜けないなぁと思う。

Kotz’i’jに恋して14：マルコフ連鎖(マヤ・キチェ語)

Posted on 2017-05-31 by 13 No'j

キチェ語のテキストを何点か用意したのでマルコフ連鎖を用いたキチェ語の文章作成を試みた。

ここでは４点紹介する。まずはグアテマラ教育省の絵本数冊の文章をテキストとして採り入れた場合。

Sentence 1:
Pix pix, pix tz'ikin rumal kenumik.

Sentence 2:
¡In ri' ra'nan! Sib'alaj maltyox at am, xattob'an che jun.

Sentence 3:
Ri winaq xkimajij utikik che'.

Sentence 4:
Ri kinan kitat xub'ij jun saqb'in tajin kutij kichapik le alaj kuk.

Sentence 5:
wachanim xa oxlajuj chik k'olik.

Sentence 6:
Wachanim xa kajib' chik k'olik.

Sentence 7:
B'e, b'e, b'e kacha' jun oqxa'n.

Sentence 8:
Ri che’ k’o jun me’s chuchi’ ri jastaq rech.

Sentence 9:
Ri inlaj ala ¿Jas le alaj taq ak', le jun riqow ib', k'ate k'uri' sib'alaj kaki'kotik, aq'ab' xwa'lijik, xa rumal xutaq uloq we xtijtaj jastaq rumal man xk'iy taj.

Sentence 10:
Jun nim tata’ xub’ij che le wachi'l.

Sentence 1:

Pix pix, pix tz'ikin rumal kenumik.

Sentence 2:

¡In ri' ra'nan! Sib'alaj maltyox at am, xattob'an che jun.

Sentence 3:

Ri winaq xkimajij utikik che'.

Sentence 4:

Ri kinan kitat xub'ij jun saqb'in tajin kutij kichapik le alaj kuk.

Sentence 5:

wachanim xa oxlajuj chik k'olik.

Sentence 6:

Wachanim xa kajib' chik k'olik.

Sentence 7:

B'e, b'e, b'e kacha' jun oqxa'n.

Sentence 8:

Ri che’ k’o jun me’s chuchi’ ri jastaq rech.

Sentence 9:

Ri inlaj ala ¿Jas le alaj taq ak', le jun riqow ib', k'ate k'uri' sib'alaj kaki'kotik, aq'ab' xwa'lijik, xa rumal xutaq uloq we xtijtaj jastaq rumal man xk'iy taj.

Sentence 10:

Jun nim tata’ xub’ij che le wachi'l.

次にマイクロソフトのサービス・アグリーメント。

Sentence 1:
Xuquje', wene' juntaq tz'ib'wuj chik ri q'atem eche che taq patanib'al riqom apan qumal uj ruk'a'm rib' ruk' ri toq'inik ri'.

Sentence 2:
We je ka'el cho qawach chi k'o ta k'o apanoq chupam ri Pataninik kech taq b'anowem ri Pataninik ri Pataninik tajin kchakux na (versi?

Sentence 3:
Arechi k'o jun urox, rumal Microsoft( jas kub'an Microsoft Studios, kojkemchak, pataninik xuquje' salob'em wachib'al sik'inik on upam ri patanib'al on B'ixkil ri Ech lal, jas ta apanoq ri Pataninik rech le software xuquje' uqasaxik xuquje' che jun tojom uperaj ri patanib'al, on ri k'olib'al pa nimk'at uchapom rib' ruk' ri b'ixkil chi uxe' taq k'utunik on wene' kuk'exla' rib' jas ri kk'ulmatajik arechi' tz'aptal ri k'olib'al lal pa http://support.

Sentence 4:
We man kk'am lal apan on kkoj lal k'ya taq uwach toq'inik ri Windows, k'ate k'uri' ri jak'ayb'al arechi' b'im chi uxe' taq Urox.

Sentence 5:
Ri ilob'al rech Microsoft, k'olib'al lal pa Skype on che uk'amik taq le wokaj la on kajach apanoq chi upam taq uwach che jun tojom uperaj ri Microsoft chisaq uya'om b'e lal ka'ok ta lal kk'amwaj lal ri k'olib'al pa Skype xuquje', we tajin kkoj lal apan ri b'i'aj jas jun b'ixkil (jas ta ne', ujachik q'ojom ri q'atem eche che taq yuk'unem.

Sentence 6:
Tza eqle'n lal kiPataninik taq ri Pataninik; xuquje' ri Pataninik on nuk'um ch'ich'; wuqub'.

Sentence 7:
We je ri' wene' man kutoq'aj taj kq'axej lal cho taqanik" jawje chi' ka'el wi jas ri k'olib'al k'o jas jun che uq'echik we taqanik.

Sentence 8:
We je ri' k'o upatan.

Sentence 9:
Tzij b'anom che taq ri Jak'ayb'al rech Windows.

Sentence 10:
Eqle'n lal uya'ik apan ronojel ri Upam Ech lal man kuq'ech ta apachike jachanik che Microsoft man rumal ta wujil che juntaq tz'ib'wuj on uwach chik ri chakub'al on che ya'tal rumal ri ya'om b'i lal jun nim apan k'ex che, kuya' kaqatoq'ij apan chech ukojik le patanib'al rech Office, ri yuk'unem, k'o ch'ich'am eche lal kuya' ta apan kumal nantat che okem che juperaj on ronojel taq pataninik ya'om kumal jub'an chik che taq rachi'l chak ri nab'e kanoq xuquje' che ri Pataninik wene' ktoq'ix lal rech Microsoft, we je kelik q'echonik kb'an lal.

Sentence 1:

Xuquje', wene' juntaq tz'ib'wuj chik ri q'atem eche che taq patanib'al riqom apan qumal uj ruk'a'm rib' ruk' ri toq'inik ri'.

Sentence 2:

We je ka'el cho qawach chi k'o ta k'o apanoq chupam ri Pataninik kech taq b'anowem ri Pataninik ri Pataninik tajin kchakux na (versi?

Sentence 3:

Arechi k'o jun urox, rumal Microsoft( jas kub'an Microsoft Studios, kojkemchak, pataninik xuquje' salob'em wachib'al sik'inik on upam ri patanib'al on B'ixkil ri Ech lal, jas ta apanoq ri Pataninik rech le software xuquje' uqasaxik xuquje' che jun tojom uperaj ri patanib'al, on ri k'olib'al pa nimk'at uchapom rib' ruk' ri b'ixkil chi uxe' taq k'utunik on wene' kuk'exla' rib' jas ri kk'ulmatajik arechi' tz'aptal ri k'olib'al lal pa http://support.

Sentence 4:

We man kk'am lal apan on kkoj lal k'ya taq uwach toq'inik ri Windows, k'ate k'uri' ri jak'ayb'al arechi' b'im chi uxe' taq Urox.

Sentence 5:

Ri ilob'al rech Microsoft, k'olib'al lal pa Skype on che uk'amik taq le wokaj la on kajach apanoq chi upam taq uwach che jun tojom uperaj ri Microsoft chisaq uya'om b'e lal ka'ok ta lal kk'amwaj lal ri k'olib'al pa Skype xuquje', we tajin kkoj lal apan ri b'i'aj jas jun b'ixkil (jas ta ne', ujachik q'ojom ri q'atem eche che taq yuk'unem.

Sentence 6:

Tza eqle'n lal kiPataninik taq ri Pataninik; xuquje' ri Pataninik on nuk'um ch'ich'; wuqub'.

Sentence 7:

We je ri' wene' man kutoq'aj taj kq'axej lal cho taqanik" jawje chi' ka'el wi jas ri k'olib'al k'o jas jun che uq'echik we taqanik.

Sentence 8:

We je ri' k'o upatan.

Sentence 9:

Tzij b'anom che taq ri Jak'ayb'al rech Windows.

Sentence 10:

Eqle'n lal uya'ik apan ronojel ri Upam Ech lal man kuq'ech ta apachike jachanik che Microsoft man rumal ta wujil che juntaq tz'ib'wuj on uwach chik ri chakub'al on che ya'tal rumal ri ya'om b'i lal jun nim apan k'ex che, kuya' kaqatoq'ij apan chech ukojik le patanib'al rech Office, ri yuk'unem, k'o ch'ich'am eche lal kuya' ta apan kumal nantat che okem che juperaj on ronojel taq pataninik ya'om kumal jub'an chik che taq rachi'l chak ri nab'e kanoq xuquje' che ri Pataninik wene' ktoq'ix lal rech Microsoft, we je kelik q'echonik kb'an lal.

マイクロソフト関連の単語が見られる。次に聖書の一部分（フィリピ人への手紙）から。

Sentence 1:
coj, chijiquiba' ba' u k'ij sak! Je chbanok.

Sentence 2:
ch ajchac, chebato' ri ixokib ri'.

Sentence 3:
yo'w nu chuk'ab.

Sentence 4:
Xane' are quinwaj quinwil i tom, xukuje' ri c'olem pa ri Crist Jesus.

Sentence 5:
Sibalaj quinquicot pa ri mebayil xukuje' ri nu tzij ri i ch'uch'ujil chetamax cumal conojel ri winak.

Sentence 6:
Ri in quinquicotic pune' man ix cuininak ta che choman pa ri Ajawaxel.

Sentence 7:
Weta'm chi xa' rech c?

Sentence 8:
ch ajchac, chebato' ri ixokib ri'.

Sentence 9:
chajin na ri Ajawaxel, lok'alaj tak wachalal, ri sibalaj quinwaj quinwil i w?

Sentence 10:
C'o jun jasach mixoc wi il, xane' xuwi ri nu tzij re colobal ib, aretak xinito' pa ri wuj re c'aslemal.

Sentence 1:

coj, chijiquiba' ba' u k'ij sak! Je chbanok.

Sentence 2:

ch ajchac, chebato' ri ixokib ri'.

Sentence 3:

yo'w nu chuk'ab.

Sentence 4:

Xane' are quinwaj quinwil i tom, xukuje' ri c'olem pa ri Crist Jesus.

Sentence 5:

Sibalaj quinquicot pa ri mebayil xukuje' ri nu tzij ri i ch'uch'ujil chetamax cumal conojel ri winak.

Sentence 6:

Ri in quinquicotic pune' man ix cuininak ta che choman pa ri Ajawaxel.

Sentence 7:

Weta'm chi xa' rech c?

Sentence 8:

ch ajchac, chebato' ri ixokib ri'.

Sentence 9:

chajin na ri Ajawaxel, lok'alaj tak wachalal, ri sibalaj quinwaj quinwil i w?

Sentence 10:

C'o jun jasach mixoc wi il, xane' xuwi ri nu tzij re colobal ib, aretak xinito' pa ri wuj re c'aslemal.

スペル等が現在一般的に用いられているキチェ語のものと異なる。最後にキチェ族の聖典ポポル・ブフの一部から。

Sentence 1:
Ta xkich'ik k'ut e nab'e uchan: maja' b'i oq jun winaq, jun chikop, nima chikop ri ulew xuya' o Alom, K'ajolom; xutzininaq chik xek'oje wi ri kaj, ulew! Mata k'ut uq'ijilab'al, uq'ala'ib'al ri kej, tz'ikin; rumal Tz'aqol, B'itol; Uchuch, Uqajaw k'aslem, Winaqirem: Ab'anel, K'uxlanel; Alayrech, K'uxlay rech saqil amaqil; saqil al, saqil k' a'm.

Sentence 2:
Xecha' k'ut.

Sentence 3:
Ta xe'uchax chi k'oje chajal k' a'm.

Sentence 4:
Xecha' k'ut.

Sentence 5:
Lib'aj chi' 8 xwinaqirik: k' akalolinik, katolona puch upa kaj, upa ulew; kaj k' o wi.

Sentence 6:
Ta xkitzijoj ronojel a ta chawaxoq, ta chi k'is tz'uq ronojel a ta xq'alaj, ta xna'ojixik saq petenaq ch'aqa palo utzijoxik qamujib'al, ilb'al re Popa Wuj , ilb'al saq petenaq ch'aqa palo utzijoxik puch ewaxib'al, saqirib' al rumal Tz' aqol, B'itol, Alom, K'ajolom kib'i'.

Sentence 7:
E nimaq etamanel, e nimaq etamanel, e nab'e xkinojij, xkitzijoj puch; jusuk' xwinaqir ulew, juyub'-taq'aj; 10 xch'ob'och'ox ub'e ja': xb'inije'ik k'ole je raqan xo'l taq juyub' xa kachamanik, katz'ininik chi q'equ'm, chi aq'ab'.

Sentence 8:
Nim upe' oxik, utzijoxik puch, xa ewal uwach ilol re, b'isol re.

Sentence 9:
xchiqelesaj rumal maja b'i chik kiq'ij; xraj k'u kitij chik kiq'ij; xraj kitijtob'ej chik, xraj kitijtob'ej chik, Raxa Kaqulja; rox chik, Raxa Kaqulja; rox chik, xraj pu kinuk' chik ilb'al re ki' chelaj ronojel uwinaqil juyub': ri kej, tz'ikin; rumal Tz'aqol, B'itol, Alom, K'ajolom; xutzininaq chik ronojel ruk' xkib'an chik chi lolinik, ma xnawachir wi k'ut, xa remanik ja', xa li' anik palo, xa utukel ri Tz' aqol, B'itol, rnawi mixutzinik, mawi mixixch' awik.

Sentence 10:
Lib'aj chi' 8 xwinaqirik: k' o wi.

Sentence 1:

Ta xkich'ik k'ut e nab'e uchan: maja' b'i oq jun winaq, jun chikop, nima chikop ri ulew xuya' o Alom, K'ajolom; xutzininaq chik xek'oje wi ri kaj, ulew! Mata k'ut uq'ijilab'al, uq'ala'ib'al ri kej, tz'ikin; rumal Tz'aqol, B'itol; Uchuch, Uqajaw k'aslem, Winaqirem: Ab'anel, K'uxlanel; Alayrech, K'uxlay rech saqil amaqil; saqil al, saqil k' a'm.

Sentence 2:

Xecha' k'ut.

Sentence 3:

Ta xe'uchax chi k'oje chajal k' a'm.

Sentence 4:

Xecha' k'ut.

Sentence 5:

Lib'aj chi' 8 xwinaqirik: k' akalolinik, katolona puch upa kaj, upa ulew; kaj k' o wi.

Sentence 6:

Ta xkitzijoj ronojel a ta chawaxoq, ta chi k'is tz'uq ronojel a ta xq'alaj, ta xna'ojixik saq petenaq ch'aqa palo utzijoxik qamujib'al, ilb'al re Popa Wuj , ilb'al saq petenaq ch'aqa palo utzijoxik puch ewaxib'al, saqirib' al rumal Tz' aqol, B'itol, Alom, K'ajolom kib'i'.

Sentence 7:

E nimaq etamanel, e nimaq etamanel, e nab'e xkinojij, xkitzijoj puch; jusuk' xwinaqir ulew, juyub'-taq'aj; 10 xch'ob'och'ox ub'e ja': xb'inije'ik k'ole je raqan xo'l taq juyub' xa kachamanik, katz'ininik chi q'equ'm, chi aq'ab'.

Sentence 8:

Nim upe' oxik, utzijoxik puch, xa ewal uwach ilol re, b'isol re.

Sentence 9:

xchiqelesaj rumal maja b'i chik kiq'ij; xraj k'u kitij chik kiq'ij; xraj kitijtob'ej chik, xraj kitijtob'ej chik, Raxa Kaqulja; rox chik, Raxa Kaqulja; rox chik, xraj pu kinuk' chik ilb'al re ki' chelaj ronojel uwinaqil juyub': ri kej, tz'ikin; rumal Tz'aqol, B'itol, Alom, K'ajolom; xutzininaq chik ronojel ruk' xkib'an chik chi lolinik, ma xnawachir wi k'ut, xa remanik ja', xa li' anik palo, xa utukel ri Tz' aqol, B'itol, rnawi mixutzinik, mawi mixixch' awik.

Sentence 10:

Lib'aj chi' 8 xwinaqirik: k' o wi.

コードは下記の通り。

MarkovChain.rb

# Reads the file at +text+ and appends its words to +words+, with the first
# word of every sentence prefixed by "%START%".
#
# Sentences are the matches of /[^.?!]*./: a run of characters other than
# ".", "?" and "!" followed by one more character, so the terminator stays
# attached to the last word. Each sentence is then split on whitespace.
#
# The file is read with File.read rather than Kernel#open so that a path
# beginning with "|" is treated as a file name and never run as a command.
#
# @param text [String] path of the text file
# @param words [Array<String>] array that receives the words (modified)
# @return [Array<String>] the sentences found in the file
def paragraphSplit(text, words)
  sentences = File.read(text).scan(/[^.?!]*./)
  sentences.each do |sentence|
    sentence.split.each_with_index do |word, index|
      words.push(index.zero? ? "%START%" + word : word)
    end
  end
end

# Generates one sentence by walking the triple table +markov+.
#
# Each row is [prefix, middle, suffix]. A random row whose prefix contains
# "%START%" opens the sentence. After that, a random row whose prefix equals
# the current suffix contributes its middle and suffix words, until a suffix
# containing "%END%" is reached. Both markers are removed from the result.
#
# @param markov [Array<Array<String>>] rows of three strings
# @param newSentence [Object] ignored; the sentence always starts empty. The
#   parameter is kept (now optional) so existing two-argument calls still work.
# @return [String, nil] the sentence; nil when the table has no "%START%" row
#   or when "%END%" was not reached within 100 steps. When the chain reaches
#   a word that no row starts with, the words collected so far are returned.
def writeASentence(markov, newSentence = nil)
  # Array#sample returns nil for an empty list, so a missing opening row is
  # detected here instead of failing later on a nil lookup.
  opening = markov.select { |prefix, _middle, _suffix| prefix.include?("%START%") }.sample
  return nil if opening.nil?

  first, second, suffix = opening
  sentence = "#{first} #{second} #{suffix}"
  steps = 1 # the opening row counts as the first of at most 100 steps

  until suffix.include?("%END%")
    return nil if steps >= 100

    following = markov.select { |prefix, _middle, _suffix| prefix == suffix }.sample
    break if following.nil? # dead end: no row continues from this word

    _, middle, suffix = following
    sentence << " " << middle << " " << suffix
    steps += 1
  end

  sentence.gsub("%START%", "").gsub("%END%", "")
end

# Appends the triples derived from +words+ to +markov+.
#
# Nothing is added when fewer than three words are given. For every position
# a row [word, next word, word after next] is appended, except that:
# * positions whose first word contains "." or "?" are skipped;
# * the third element becomes "%END%" when there is no third word or the
#   second word contains "." or "?";
# * "%END%" is appended to the third word when that word contains "." or "?".
def markovDic(words, markov)
  return if words.size < 3

  closes_sentence = ->(word) { word.include?(".") || word.include?("?") }

  (0..words.size - 2).each do |i|
    first = words[i]
    second = words[i + 1]
    third = words[i + 2]
    next if closes_sentence.call(first)

    ending =
      if third.nil? || closes_sentence.call(second)
        "%END%"
      elsif closes_sentence.call(third)
        third + "%END%"
      else
        third
      end
    markov << [first, second, ending]
  end
end

def paragraphSplit(text, words)

temp = ""

input = ""

open(text) do |f|

f.each do |line|

temp << line

end

input << temp

sentence = input.scan(/[^.?!]*./)

sentence.each do |s|

fragments = s.split

count = 0

fragments.each do |word|

if count == 0 then

word = "%START%" + word

end

words.push(word)

count += 1

end

def writeASentence(markov, newSentence)

count = 0

suffix = ""

newSentence = ""

while count < 100

if newSentence == "" then

startCandidates = []

candidatesCount = 0

markov.each do |a, b, c|

if a.include?("%START%") then

startCandidates << [a, b, c]

candidatesCount += 1

end

r = rand(candidatesCount)

a = startCandidates[r][0]

b = startCandidates[r][1]

c = startCandidates[r][2]

newSentence = a + " " + b + " " + c

suffix = c

count += 1

else

rowCount = 0

candidates = []

markov.each do |a, b, c|

if suffix == a then;

candidates << [a, b, c]

rowCount += 1

end

r = rand(rowCount)

b = candidates[r][1]

c = candidates[r][2]

newSentence += " " + b + " " + c

suffix = c

count += 1

end

if suffix.include?("%END%") then

newSentence.gsub!("%START%", "")

newSentence.gsub!("%END%", "")

return newSentence

break

end

def markovDic(words, markov)

unless words.size < 3

for i in 0..words.size - 2 do

next if words[i].include?(".") or words[i].include?("?")

if words[i+2] == nil or words[i+1].include?(".") or words[i+1].include?("?") then

markov << [words[i], words[i+1], "%END%"]

elsif words[i+2].include?(".") or words[i+2].include?("?") then

markov << [words[i], words[i+1], words[i+2] + "%END%"]

else

markov << [words[i], words[i+1], words[i+2]]

end

MarkovChain.rb

require '.\MarkovChain'

words = []
markov = []

# Alternative corpora; enable one line to switch the source text.
#paragraphSplit('./ALMG.txt', words)
#paragraphSplit('./MINEDUC.txt', words)
#paragraphSplit('./MSKiche.txt', words)
#paragraphSplit('./AjPilipsib4.txt', words)
paragraphSplit('./PopWuj.txt', words)
markovDic(words, markov)

# Print ten generated sentences, each under a numbered heading and followed
# by a blank line. The previous sentence is handed back in as the second
# argument on each call.
sentence = nil
(1..10).each do |count|
  sentence = writeASentence(markov, sentence)
  puts "Sentence #{count}: "
  puts sentence
  puts
end

require '.\MarkovChain'

# Test driver: passes `words` and `markov` to paragraphSplit and markovDic
# (both expected to come from the required MarkovChain file, presumably
# filling the arrays in place) and then prints ten generated sentences.
words = []

markov = []

# Alternative corpora; enable one line to switch the source text.
#paragraphSplit('./ALMG.txt', words)

#paragraphSplit('./MINEDUC.txt', words)

#paragraphSplit('./MSKiche.txt', words)

#paragraphSplit('./AjPilipsib4.txt', words)

paragraphSplit('./PopWuj.txt', words)

markovDic(words, markov)

count = 1

while count <=10 do

# `sentence` is nil on the first pass; afterwards the previous result is
# passed back in as the second argument.
sentence = writeASentence(markov, sentence)

puts ("Sentence " + count.to_s + ": ")

puts sentence

# Blank line between sentences.
puts""

count += 1

end

Rubyにおける文字列分割方法：split & scan

Posted on 2017-05-31 by 13 No'j

前回Rubyでマルコフ連鎖を用いた文章作成アルゴリズムを書いてみたけど、その際に文章の分割方法に少し苦労したので備忘録として。

英語やスペイン語みたいに単語と単語が空白で分かれている場合のケースを想定。単純に単語毎に分割したい場合は.split(sentence.splitみたいに)を使えばいいけど、今回は一文毎に分割する方法について書いてみる。

例としてフランツ・カフカの「変身(Metamorphosis)」の第二段落を使用（？も含んでいるため）。文章毎に分割するということなので区切り文字としてピリオドとクエスチョンマークを採用。

最初は単純にsplitを使ってこういう風に書いてみた。

M1 = Metamorphosis.split(/[.?]/)

1	M1 = Metamorphosis.split(/[.?]/)

結果はこうなる。

-----Part 1-----
What's happened to me
-----Part 2-----
 he thought
-----Part 3-----
  It wasn't a dream
-----Part 4-----
  His room, a proper human room although a little too small, lay peacefully between its four familiar walls
-----Part 5-----
  A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and
above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame
-----Part 6-----
  It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm
towards the viewer

-----Part 1-----

What's happened to me

-----Part 2-----

he thought

-----Part 3-----

It wasn't a dream

-----Part 4-----

His room, a proper human room although a little too small, lay peacefully between its four familiar walls

-----Part 5-----

A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and

above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame

-----Part 6-----

It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm

towards the viewer

しっかり分けられているけど、区切り文字が取り除かれてしまっているのでこれでは使えない。次に試したのが同じsplitメソッドだけど少し修正を加えたもの。具体的にはブラケット（[.?]）全体を丸括弧（parentheses）で囲み、区切り文字も結果に含まれるようにする。

M2 = Metamorphosis.split(/([.?])/)

1	M2 = Metamorphosis.split(/([.?])/)

今回はこういう風に出力された。

-----Part 1-----
What's happened to me
-----Part 2-----
?
-----Part 3-----
 he thought
-----Part 4-----
.
-----Part 5-----
  It wasn't a dream
-----Part 6-----
.
-----Part 7-----
  His room, a proper human room although a little too small, lay peacefully between its four familiar walls
-----Part 8-----
.
-----Part 9-----
  A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and
above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame
-----Part 10-----
.
-----Part 11-----
  It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm
towards the viewer
-----Part 12-----
.

-----Part 1-----

What's happened to me

-----Part 2-----

-----Part 3-----

he thought

-----Part 4-----

-----Part 5-----

It wasn't a dream

-----Part 6-----

-----Part 7-----

His room, a proper human room although a little too small, lay peacefully between its four familiar walls

-----Part 8-----

-----Part 9-----

A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and

above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame

-----Part 10-----

-----Part 11-----

It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm

towards the viewer

-----Part 12-----

今回は区切り文字もしっかり含まれているけれど、区切り文字自体も一つの独立した部分として戻された。マルコフ連鎖アルゴリズムでは区切り文字は勿論文章の単語にくっつく形でなければならないのでこれも不十分。

で色々調べた結果、splitではなくscanメソッドを使えることに気付いた。試行錯誤した後、こう書いてみた。

M3 = Metamorphosis.scan(/[^.?]*./)

1	M3 = Metamorphosis.scan(/[^.?]*./)

簡単に説明するとブラケット（[^]）内の文字を除いた文字を全てスキャンし、その省いた文字を区切り文字として利用する仕様。こうすると下記の様な結果を得られる。

-----Part 1-----
What's happened to me?
-----Part 2-----
 he thought.
-----Part 3-----
  It wasn't a dream.
-----Part 4-----
  His room, a proper human room although a little too small, lay peacefully between its four familiar walls.
-----Part 5-----
  A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and
above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.
-----Part 6-----
  It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm
towards the viewer.

-----Part 1-----

What's happened to me?

-----Part 2-----

he thought.

-----Part 3-----

It wasn't a dream.

-----Part 4-----

His room, a proper human room although a little too small, lay peacefully between its four familiar walls.

-----Part 5-----

A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and

above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.

-----Part 6-----

It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm

towards the viewer.

取り敢えず、これで必要な結果は得られた。なお、この方法だとどうしても二文目以降の文頭に空白が出来てしまう。これはstripメソッドで解決。

M3.strip-----Part 1-----
What's happened to me?
-----Part 2-----
he thought.
-----Part 3-----
It wasn't a dream.
-----Part 4-----
His room, a proper human room although a little too small, lay peacefully between its four familiar walls.
-----Part 5-----
A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and
above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.
-----Part 6-----
It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm
towards the viewer.
What's happened to me? he thought.  It wasn't a dream.  His room, a proper human room although a little too small, lay peacefully between its four familiar walls.  A collec                  tion of textile samples lay spread out on the table - Samsa was a travelling salesman - and
above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.  It showed a lady fitted out with a fur hat and fu                  r boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm
towards the viewer.!

M3.strip-----Part 1-----

What's happened to me?

-----Part 2-----

he thought.

-----Part 3-----

It wasn't a dream.

-----Part 4-----

His room, a proper human room although a little too small, lay peacefully between its four familiar walls.

-----Part 5-----

A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and

above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame.

-----Part 6-----

It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm

towards the viewer.

What's happened to me? he thought. It wasn't a dream. His room, a proper human room although a little too small, lay peacefully between its four familiar walls. A collec tion of textile samples lay spread out on the table - Samsa was a travelling salesman - and

above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame. It showed a lady fitted out with a fur hat and fu r boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm

towards the viewer.!

空白がしっかり取り除かれている。入力されたテキストを文に分割することが出来たのでマルコフ連鎖による文章作成の準備が整った！

一応、参考まで今回のコードはこんな感じ。

##Split and Scan Test
##Metamorphosis retrieved from: http://www.gutenberg.org/cache/epub/5200/pg5200.txt

# Prints every element of +sentences+ under a numbered "-----Part N-----" header.
# Each element is stripped in place before printing, so the caller's strings
# lose their leading/trailing whitespace as a side effect.
def displayParts(sentences)
  sentences.each_with_index do |segment, index|
    puts "-----Part #{index + 1}-----"
    segment.strip! ## trim surrounding whitespace in place
    puts segment
  end
end

## Load the whole sample text into a single string.
Metamorphosis = File.read('.\Metamorphosis.txt')

## split with a plain character class: the "." / "?" delimiters are dropped.
M1 = Metamorphosis.split(/[.?]/)
displayParts(M1)

## split with a capture group: each delimiter comes back as its own element.
M2 = Metamorphosis.split(/([.?])/)
displayParts(M2)

## scan: each match is a run of non-delimiter characters plus the one
## character that follows it, so the delimiter stays attached to the text.
M3 = Metamorphosis.scan(/[^.?]*./)
displayParts(M3)

##Split and Scan Test

##Metamorphosis retrieved from: http://www.gutenberg.org/cache/epub/5200/pg5200.txt

# Prints every element of +sentences+ under a numbered "-----Part N-----" header.
# Each element is stripped in place before printing, so the caller's strings
# lose their leading/trailing whitespace as a side effect.
def displayParts(sentences)
  count = 1
  sentences.each do |parts|
    puts "-----Part " + count.to_s + "-----"
    parts.strip! ## Remove unnecessary spaces
    puts parts
    count += 1
  end
end

## Load the whole sample text into a single string, line by line.
Metamorphosis = ""
open('.\Metamorphosis.txt') do |t|
  t.each do |line|
    Metamorphosis << line
  end
end

M1 = Metamorphosis.split(/[.?]/) ## DON'T return delimiters
displayParts(M1)

M2 = Metamorphosis.split(/([.?])/) ## RETURN delimiters
displayParts(M2)

M3 = Metamorphosis.scan(/[^.?]*./) ## RETURN delimiters ATTACHED TO words
displayParts(M3)

Kotz’i’jに恋して１３：マルコフ連鎖

Posted on 2017-05-31 by 13 No'j

前回述べた通り、マルコフ連鎖による文章作成のコードを書いてみた。辞書ファイルの保存を言語に関係無く容易に出来る様にするため、統計分析で用いることの多い二次元配列にした。

辞書のサイズが今後増えることを考えればツリー構造にすべきだろうけど、取り敢えず今は単純な２次元配列。

もう出し尽くされた感のあるテーマだけどコードは下の通り。

# Reads a text file and appends its words to +words+, marking sentence starts.
#
# The file content is cut into chunks with /[^.?]*./ (a run of characters that
# are neither "." nor "?", plus the one character that follows). Each chunk is
# split on whitespace, and the first word of every chunk gets a "%START%" prefix.
#
# @param text [String] path of the file to read
# @param words [Array<String>] receives the words; modified in place
# @return [Array<String>] the chunks produced by the scan
def paragraphSplit(text, words)
  # File.read rather than Kernel#open: on Ruby versions where Kernel#open
  # supports pipes, a path beginning with "|" would be run as a command.
  content = File.read(text)
  chunks = content.scan(/[^.?]*./)
  chunks.each do |chunk|
    chunk.split.each_with_index do |word, position|
      words.push(position.zero? ? "%START%" + word : word)
    end
  end
end

# Builds one sentence by walking the Markov table from a random sentence start.
#
# A row whose first word contains "%START%" is picked at random as the opening.
# The third word of the latest row becomes the suffix; the next row is picked at
# random among rows whose first word equals that suffix, and its second and third
# words are appended. The walk stops once the suffix contains "%END%".
#
# @param markov [Array<Array<String>>] rows of three words [a, b, c]
# @param newSentence [Object] ignored; kept (now optional) so existing
#   two-argument calls keep working
# @return [String, nil] the sentence with the "%START%"/"%END%" markers removed,
#   or nil when no sentence could be completed: no start row exists, the suffix
#   has no continuation, or 100 rows were used without reaching "%END%"
def writeASentence(markov, newSentence = nil)
  max_rows = 100

  opening = markov.select { |row| row[0].include?("%START%") }.sample
  # Array#sample returns nil for an empty list instead of indexing with the
  # Float that rand(0) yields, which used to raise NoMethodError.
  return nil if opening.nil?

  # Index the table once instead of rescanning every row on each step.
  rows_by_first_word = markov.group_by { |row| row[0] }

  pieces = opening.dup
  suffix = opening[2]
  rows_used = 1

  until suffix.include?("%END%")
    return nil if rows_used >= max_rows

    following = rows_by_first_word.fetch(suffix, []).sample
    return nil if following.nil? # dead end: nothing continues from this word

    pieces << following[1] << following[2]
    suffix = following[2]
    rows_used += 1
  end

  # strip drops the trailing space left when the last piece is a bare "%END%".
  pieces.join(" ").gsub("%START%", "").gsub("%END%", "").strip
end



# Collect the words of the source text into one flat list.
words = []
paragraphSplit('./NoOyesRulfo.txt', words)

# A word closes a sentence when it contains "." or "?".
closes_sentence = lambda { |token| token.include?(".") || token.include?("?") }

# Build the table of word triples. A triple never starts on a sentence-closing
# word, and its third slot carries "%END%" when the sentence finishes there.
markov = []
if words.size >= 3
  (0..words.size - 2).each do |i|
    head = words[i]
    middle = words[i + 1]
    tail = words[i + 2]
    next if closes_sentence.call(head)

    row =
      if tail.nil? || closes_sentence.call(middle)
        [head, middle, "%END%"]
      elsif closes_sentence.call(tail)
        [head, middle, tail + "%END%"]
      else
        [head, middle, tail]
      end
    markov << row
  end
end

puts "-----let's generate sentences-----"

# Print ten generated sentences, numbered from 1.
sentence = nil
(1..10).each do |number|
  sentence = writeASentence(markov, sentence)
  puts "No. " + number.to_s + ": "
  puts sentence
  puts ""
end

100

101

102

103

# Reads a text file and appends its words to +words+, marking sentence starts.
#
# The file content is cut into chunks with /[^.?]*./ (a run of characters that
# are neither "." nor "?", plus the one character that follows). Each chunk is
# split on whitespace, and the first word of every chunk gets a "%START%" prefix.
#
# @param text [String] path of the file to read
# @param words [Array<String>] receives the words; modified in place
# @return [Array<String>] the chunks produced by the scan
def paragraphSplit(text, words)
  # File.read rather than Kernel#open: on Ruby versions where Kernel#open
  # supports pipes, a path beginning with "|" would be run as a command.
  content = File.read(text)
  chunks = content.scan(/[^.?]*./)
  chunks.each do |chunk|
    chunk.split.each_with_index do |word, position|
      words.push(position.zero? ? "%START%" + word : word)
    end
  end
end

# Builds one sentence by walking the Markov table from a random sentence start.
#
# A row whose first word contains "%START%" is picked at random as the opening.
# The third word of the latest row becomes the suffix; the next row is picked at
# random among rows whose first word equals that suffix, and its second and third
# words are appended. The walk stops once the suffix contains "%END%".
#
# @param markov [Array<Array<String>>] rows of three words [a, b, c]
# @param newSentence [Object] ignored; kept (now optional) so existing
#   two-argument calls keep working
# @return [String, nil] the sentence with the "%START%"/"%END%" markers removed,
#   or nil when no sentence could be completed: no start row exists, the suffix
#   has no continuation, or 100 rows were used without reaching "%END%"
def writeASentence(markov, newSentence = nil)
  max_rows = 100

  opening = markov.select { |row| row[0].include?("%START%") }.sample
  # Array#sample returns nil for an empty list instead of indexing with the
  # Float that rand(0) yields, which used to raise NoMethodError.
  return nil if opening.nil?

  # Index the table once instead of rescanning every row on each step.
  rows_by_first_word = markov.group_by { |row| row[0] }

  pieces = opening.dup
  suffix = opening[2]
  rows_used = 1

  until suffix.include?("%END%")
    return nil if rows_used >= max_rows

    following = rows_by_first_word.fetch(suffix, []).sample
    return nil if following.nil? # dead end: nothing continues from this word

    pieces << following[1] << following[2]
    suffix = following[2]
    rows_used += 1
  end

  # strip drops the trailing space left when the last piece is a bare "%END%".
  pieces.join(" ").gsub("%START%", "").gsub("%END%", "").strip
end

# Collect the words of the source text into one flat list.
words = []
paragraphSplit('./NoOyesRulfo.txt', words)

# A word closes a sentence when it contains "." or "?".
closes_sentence = lambda { |token| token.include?(".") || token.include?("?") }

# Build the table of word triples. A triple never starts on a sentence-closing
# word, and its third slot carries "%END%" when the sentence finishes there.
markov = []
if words.size >= 3
  (0..words.size - 2).each do |i|
    head = words[i]
    middle = words[i + 1]
    tail = words[i + 2]
    next if closes_sentence.call(head)

    row =
      if tail.nil? || closes_sentence.call(middle)
        [head, middle, "%END%"]
      elsif closes_sentence.call(tail)
        [head, middle, tail + "%END%"]
      else
        [head, middle, tail]
      end
    markov << row
  end
end

puts "-----let's generate sentences-----"

# Print ten generated sentences, numbered from 1.
sentence = nil
(1..10).each do |number|
  sentence = writeASentence(markov, sentence)
  puts "No. " + number.to_s + ": "
  puts sentence
  puts ""
end

物凄く単純。これはキチェ語に限らず、単語の間を空白で区切る言語であれば何語にも使える。例としてJuan RulfoのNo oyes ladrar a los perrosを辞書に取り入れてみた。出力結果はこんな感じ。

英語で書かれた「老人と海」を採り入れた場合の出力例。

英語とスペイン語で色んな文章を採り入れて試した結果分かったのは、ある程度しっかりとしたボット・プログラム、つまり回答を作成するプログラムには、単純なマルコフ連鎖だけでは使い物にならないということ。ベースとしてはマルコフ連鎖は使えるけど、ある程度文法を考慮した仕様にする必要があるし、キーワードで返答を作成するのであれば、文章の構築方法も頭から書き始める以外の選択肢が必要。それが分かっただけでも収穫かな。

取り敢えず、このコードをTzijonikに取り入れてから改良していこうと思う。

13 No'j

グアテマラ、マヤ文明とプログラミング/Mayan World and Programming

Author Archives: 13 No'j

Kotz’i’jに恋して16

Kotz’i’jに恋して15：マルコフ連鎖(マヤ・キチェ語その２)

Kotz’i’jに恋して14：マルコフ連鎖(マヤ・キチェ語)

Rubyにおける文字列分割方法：split & scan

Kotz’i’jに恋して１３：マルコフ連鎖