In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import numpy
In [2]:
# Simulate the bigram model for Groot
rng = numpy.random.default_rng(seed = 42)
token = [" I", " am", " Groot", "."]
tr = numpy.array([[0.1, 0.7, 0.1, 0.1],
                  [0.1, 0.1, 0.7, 0.1],
                  [0.1, 0.1, 0.1, 0.7],
                  [0.7, 0.1, 0.1, 0.1]])
seq = [rng.choice(4)]
text = [token[seq[0]]]
for i in range(1000):
    seq.append(rng.choice(4, p = tr[seq[i], :]))
    text.append(token[seq[i + 1]])
"".join(text[:50])
Out[2]:
' I am Groot. I. am Groot am Groot.. I Groot. I am I Groot. am Groot. Groot. I am I am Groot.. I am Groot am am Groot Groot. I Groot. I Groot. I am Groot am'
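In [ ]:
# Added sketch (not part of the original run): the transition matrix tr is
# doubly stochastic (every row and every column sums to 1), so the chain's
# stationary distribution should be uniform -- each token appears about 25%
# of the time in a long sample. We can recover it as the eigenvector of tr.T
# associated with eigenvalue 1, normalized to sum to 1.
vals, vecs = numpy.linalg.eig(tr.T)
stationary = numpy.real(vecs[:, numpy.argmin(numpy.abs(vals - 1))])
stationary / numpy.sum(stationary)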
In [3]:
# Estimate the transition matrix from the sampled sequence
tr_hat = numpy.zeros((4, 4))
for i in range(999):
    tr_hat[seq[i], seq[i + 1]] += 1
tr_hat = tr_hat / numpy.sum(tr_hat, axis = 1).reshape(-1, 1)
tr_hat
Out[3]:
array([[0.09053498, 0.71604938, 0.10699588, 0.08641975],
       [0.12260536, 0.09578544, 0.67816092, 0.10344828],
       [0.08333333, 0.13492063, 0.09920635, 0.68253968],
       [0.69135802, 0.11522634, 0.09876543, 0.09465021]])
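In [ ]:
# Added sketch (not part of the original run): compare the estimate to the
# true matrix. With roughly 250 visits per state, each estimated probability
# should land within a few hundredths of the truth, as Out[3] suggests.
numpy.max(numpy.abs(tr_hat - tr))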
In [4]:
# Count the number of times "I am Groot" appeared
count = 0
for i in range(998):
    if seq[i] == 0 and seq[i + 1] == 1 and seq[i + 2] == 2:
        count += 1
count
Out[4]:
120
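In [ ]:
# Added sketch (not part of the original run): a plausibility check on the
# count. Under the uniform stationary distribution a window starts at " I"
# with probability 0.25, then P(" am" | " I") = 0.7 and
# P(" Groot" | " am") = 0.7, so across the 998 windows the loop scans we
# expect about 998 * 0.25 * 0.7 * 0.7 ~= 122 occurrences, close to the
# observed 120.
998 * 0.25 * 0.7 * 0.7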