# Define a function.
def add(x, y):
    """Return the result of ``x + y``.

    The operands are combined with the ``+`` operator, so any pair of
    values that supports it (numbers, strings, lists, ...) is accepted.
    """
    total = x + y
    return total


# Call the function and print what it returns.
print(add(1, 2))

# Loops and conditional branching.
xs = [-3, 0, 3]
for x in xs:
    # Classify each value by its sign; every branch prints a label
    # followed by the value itself.
    if x < 0:
        print('負の値です。', x)
    elif x == 0:
        print('0です。', x)
    else:
        print('正の値です。', x)

# NOTE: the statements in this section are bare expressions. Their values are
# displayed in an interactive session but are discarded when this file is run
# as a script.

# Get the number of elements in a list (3 here).
len(['aaa', 'bbb', 'ccc'])

# Note: len() also works on a string, counting its characters (3 here).
len('abc')

# Remove duplicate elements. Note that set() produces a set, not a list.
set(['a', 'b', 'c', 'a'])

# Put the elements in order; sorted() returns a new list (['a', 'b', 'c']).
sorted(['b', 'a', 'c'])

# Concatenate two lists (['a', 'b', 'c', 'd']).
['a', 'b'] + ['c', 'd']

# Repeat a list (['a', 'b', 'c', 'a', 'b', 'c']).
['a', 'b', 'c'] * 2

# Note: strings can be repeated the same way ('abcabc').
'abc' * 2

# Get a single element by index. Indices start at 0, so x[1] is 'b'.
x = ['a', 'b', 'c', 'd', 'e']
x[1]

# Slice from index 1 up to, but not including, index 3: ['b', 'c'].
x[1:3]

# Slice from index 1 to the end: ['b', 'c', 'd', 'e'].
x[1:]

# Slice from the start up to, but not including, index 3: ['a', 'b', 'c'].
x[:3]

# Strings accept the same slice syntax: 'bc'.
'abcde'[1:3]

# Slice counted from the end: the last two elements, ['d', 'e'].
x[-2:]

# Slice counted from the end: everything except the last two elements,
# ['a', 'b', 'c'].
x[:-2]

# List -> string: join the elements with '|' between them ('a|b|c').
'|'.join(['a','b','c'])

# String -> list: split on '|' (['a', 'b', 'c']).
'a|b|c'.split('|')

# List comprehension: keep only the strings longer than 2 characters and
# upper-case them (['DDDD', 'CCC', 'EEEEE']).
[x.upper() for x in ['dddd', 'bb', 'ccc', 'a' ,'eeeee'] if len(x) > 2]

import nltk

from nltk.book import *

from nltk.text import Text

# Select a Japanese font for matplotlib plots.
import matplotlib
import matplotlib.font_manager as font_manager
# Hard-coded absolute path to a Japanese Gothic TrueType font; adjust it for
# the local system.
font_path = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'
font_prop = font_manager.FontProperties(fname = font_path)
# The font family used by default for all plots is set globally here.
matplotlib.rcParams['font.family'] = font_prop.get_name() # If the font name is written directly as a string instead of font_prop.get_name(), the previous two lines are not needed.

import nltk
from nltk.corpus.reader import *
from nltk.corpus.reader.util import *
from nltk.probability import *
from nltk.tokenize.api import *
from nltk.text import Text

# Word tokenizer backed by MeCab.
class JPMeCabTokenizer(TokenizerI):
    """Split Japanese text into words with MeCab.

    The tagger is created with the ``-Owakati`` option and its output is
    split on single spaces, so each returned token is one space-delimited
    field of MeCab's output.
    """

    def __init__(self):
        # Imported here rather than at module level, so MeCab is only
        # required once a tokenizer is actually instantiated.
        import MeCab
        self.mecab = MeCab.Tagger('-Owakati')

    def tokenize(self, text):
        """Return the tokens MeCab produces for ``text``.

        Args:
            text: The string to tokenize.

        Returns:
            A list of token strings. The list is empty when MeCab's output
            contains nothing but whitespace (e.g. for empty input).
        """
        result = self.mecab.parse(text)
        # Splitting an empty string on ' ' yields [''], so filter out empty
        # fields to avoid reporting a bogus empty token.
        return [token for token in result.strip().split(' ') if token]

# Sentence tokenizer for Japanese text. The pattern describes a run of
# characters that are none of: full-width space, 「, 」, ！, ？, 。 -- followed
# by exactly one sentence-ending mark (！, ？ or 。).
jp_sent_tokenizer = nltk.RegexpTokenizer('[^　「」！？。]*[！？。]')

# Download NLTK packages (e.g. corpora).
# NOTE(review): called with no arguments, which NLTK documents as opening its
# interactive downloader -- confirm this is intended if the file is ever run
# non-interactively.
# NOTE(review): `from nltk.book import *` appears earlier in this file and
# presumably needs its data to be installed already -- confirm the ordering.
nltk.download()

# The string arguments below are placeholders ("単語" means "word").
# text1 is not assigned anywhere in this file's own code; presumably it is
# provided by `from nltk.book import *` -- confirm.

# Search the text for the places where the given word is used.
text1.concordance("単語")

# Search the text for usages that share the same context as the given word.
# NOTE(review): NLTK documents Text.similar as listing *words* that occur in
# similar contexts, rather than sentences -- confirm.
text1.similar("単語")

# Search the text for contexts that the given words (plural) have in common.
text1.common_contexts(["単語1", "単語2"])

# Load the words of one document.
# The reader is pointed at a single plain-text file and is configured with
# read_line_block for block reading, jp_sent_tokenizer for sentence splitting
# and the MeCab-based tokenizer for word splitting.
reader = PlaintextCorpusReader(
    "/home/owner/data/小説",
    r'A_NKMK_4099483.txt',
    para_block_reader=read_line_block,
    sent_tokenizer=jp_sent_tokenizer,
    word_tokenizer=JPMeCabTokenizer(),
)
words = reader.words()
# Wrap the same words in an NLTK Text object.
texts = Text(words)

# Build the frequency distribution of the words.
fdist = FreqDist(words)

# Settings needed to use Japanese text in plots.
# NOTE(review): this repeats, statement for statement, the font setup that
# already appears earlier in this file.
import matplotlib
import matplotlib.font_manager as font_manager
# Hard-coded absolute path to a Japanese Gothic TrueType font; adjust it for
# the local system.
font_path = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'
font_prop = font_manager.FontProperties(fname = font_path)
matplotlib.rcParams['font.family'] = font_prop.get_name() # If the font name is written directly as a string instead of font_prop.get_name(), the previous two lines are not needed.

# Plot the word frequency distribution as a cumulative curve.
# NOTE(review): the first argument (30) presumably limits the plot to the 30
# most frequent words, and the curve presumably shows cumulative counts
# rather than ratios -- confirm against the NLTK FreqDist.plot documentation.
fdist.plot(30, cumulative = True)

# Plot the positions at which each of the listed words occurs in the text.
from nltk.draw.dispersion import dispersion_plot
dispersion_plot(texts, ['ちゃん', 'さん', '君'])