言語処理100本ノック

06. 集合¶

"paraparaparadise"と"paragraph"に含まれる文字bi-gramの集合を,それぞれ, XとYとして求め,XとYの和集合,積集合,差集合を求めよ.さらに,'se'というbi-gramがXおよびYに含まれるかどうかを調べよ.

def n_gram(s,n):
    """Return the word-level n-grams of ``s``.

    ``s`` is tokenized with ``str.split()`` (whitespace-separated tokens).
    Every window of ``n`` consecutive tokens is returned as a list, in order
    of appearance. When ``s`` has fewer than ``n`` tokens the result is empty.
    """
    tokens = s.split()
    # Index of the last position at which a full window of n tokens fits.
    last_start = len(tokens) - n
    return [tokens[i:i + n] for i in range(last_start + 1)]

# n_gram tokenizes on whitespace, so each word is first spread out into
# space-separated characters; every resulting pair of characters is then
# glued back together into a two-character bi-gram string.
X = ["".join(pair) for pair in n_gram(" ".join("paraparaparadise"), 2)]
Y = ["".join(pair) for pair in n_gram(" ".join("paragraph"), 2)]

# Deduplicate the bi-grams so that set algebra can be applied.
SX, SY = set(X), set(Y)

# Union: bi-grams that occur in either word.
print(SX | SY)
# Intersection: bi-grams that occur in both words.
print(SX & SY)
# Difference: bi-grams that occur in X but not in Y.
print(SX - SY)
# Is the bi-gram 'se' contained in X? In Y?
print('se' in SX)
print('se' in SY)
<結果>
{'pa', 'se', 'ra', 'di', 'ph', 'ag', 'is', 'gr', 'ar', 'ad', 'ap'}
{'pa', 'ar', 'ra', 'ap'}
{'se', 'ad', 'is', 'di'}
True
False