#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')
from __future__ import division
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import logging
logger = logging.getLogger()


# 3 Finding Similar Items
# ===================
# Problem:
# + "similar": finding sets with a relatively large intersection.
#     - shingling
#     - minhashing
# + too many pairs of items to test
#     - locality-sensitive hashing
# 
# 
# ### 3.1 Applications of Near-Neighbor Search
# 
# The *Jaccard Similarity* of sets $S$ and $T$:
# \begin{equation}
# SIM(S,T) = \frac{| S \cap T |}{| S \cup T |}
# \end{equation}
# 
# ##### Similar-sets Problem
# 1. Similar Documents
#    In many applications, the documents are not identical, yet they share large portions of their text.
#    e.g.: plagiarism, mirror pages, articles from the same source.
# 2. Collaborative Filtering (Similar Tastes)
#    e.g.:
#    + Online Purchases
#    + Movie Ratings
#    When the data is not binary:
#    - ignore low-rated customer/movie pairs
#    - split ratings into "liked" vs "hated"
#    - with 1-to-5-star ratings, put a movie rated $n$ stars into the set $n$ times <= _Jaccard similarity for bags_
# 
# ##### Exercises

# In[2]:

#Exercise 3.1.1
logger.setLevel(logging.WARNING)

data = [
    set([1, 2, 3, 4]),
    set([2, 3, 5, 7]),
    set([2, 4, 6]),
]

def Jaccard_similarity_calc(set_a, set_b):
    """calculate the Jaccard similarity of two sets

    res = \frac{a \cap b}{a \cup b}
    """
    assert isinstance(set_a, set), '{} is not a set'.format(set_a)
    assert isinstance(set_b, set), '{} is not a set'.format(set_b)

    logging.debug('a:{}, b:{}'.format(set_a, set_b))
    logging.debug('inter:{}, union:{}'.format(set_a.intersection(set_b), set_a.union(set_b)))
    return len(set_a.intersection(set_b)) / len(set_a.union(set_b))

for comb in list(itertools.combinations(range(3), 2)):
    set_a, set_b = data[comb[0]], data[comb[1]]
    print('a:{}, b:{}, SIM:{:.3f}'.format(set_a, set_b, Jaccard_similarity_calc(set_a, set_b)))


# In[3]:

#Exercise 3.1.2
logger.setLevel(logging.WARNING)

data = [
    [1, 1, 1, 2],
    [1, 1, 2, 2, 3],
    [1, 2, 3, 4],
]

def Jaccard_bag_similarity_calc(a, b):
    """calculate the Jaccard bag similarity of two bags

    see page 76, movie ratings 3.
    intersection = sum over elements of min(count in a, count in b)
    union = sum of the sizes of the two bags
    res = intersection / union
    """
    from collections import Counter
    count_a = Counter(a)
    count_b = Counter(b)
    logging.debug('count_a:{}\n count_b:{}'.format(count_a, count_b))

    inter = [min(count_a[x], count_b[x]) for x in count_a if x in count_b]
    logging.debug('intersection:{}'.format(inter))
    return sum(inter) / (len(a) + len(b))

for comb in list(itertools.combinations(range(3), 2)):
    set_a, set_b = data[comb[0]], data[comb[1]]
    print('a:{}, b:{}, JbSIM:{:.3f}'.format(set_a, set_b, Jaccard_bag_similarity_calc(set_a, set_b)))


# _exercise 3.1.3_
# 
# Lemma: define
# 
# \begin{equation}
# I\{A\} = \begin{cases}
# 1 & \text{if $A$ occurs,} \\
# 0 & \text{otherwise.}
# \end{cases}
# \end{equation}
# 
# Let $X_A = I\{A\}$; then $E[X_A] = 1 \cdot P[A] + 0 \cdot P[\bar{A}] = P[A]$.
# 
# Let $x_i$ be the event that $S$ and $T$ share exactly $i$ elements. Counting the ways to choose the shared part and the two disjoint remainders,
# 
# \begin{align}
# E[x_i] & = P[\text{$S$ and $T$ share exactly $i$ elements}] \\
# & = \frac{C_n^i \, C_{n-i}^{m-i} \, C_{n-m}^{m-i}}{C_n^{m} C_n^{m}}
# \end{align}
# 
# With $i$ shared elements the union has $2m - i$ elements, so $J_i = \frac{i}{2m-i}$ and
# 
# \begin{align}
# E[J] & = \sum_{i=0}^m E[x_i] \, J_i \\
# & = \frac{\sum_{i=0}^m \, C_n^i \, C_{n-i}^{m-i} \, C_{n-m}^{m-i} \, \frac{i}{2m-i}}{C_n^{m} C_n^{m}}
# \end{align}
# 
# `#todo: simplify the final expression`
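# As a quick sanity check of the expectation above, the following cell (my own
# sketch, not part of the book) estimates $E[J]$ for two random $m$-element
# subsets of an $n$-element universe by simple Monte Carlo simulation.

# In[ ]:

#Exercise 3.1.3, numerical check (simulation sketch)
import random

def expected_jaccard_mc(n, m, trials=20000, seed=0):
    """Estimate E[SIM(S, T)] for two independent random m-element subsets of
    {0, ..., n-1} by Monte Carlo simulation.
    """
    rng = random.Random(seed)
    universe = range(n)
    total = 0.0
    for _ in range(trials):
        s = set(rng.sample(universe, m))
        t = set(rng.sample(universe, m))
        total += float(len(s & t)) / len(s | t)
    return total / trials

print('n=10, m=3: estimated E[J] = {:.4f}'.format(expected_jaccard_mc(10, 3)))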
# ### 3.2 Shingling of Documents
# 
# **k-shingle**: any substring of length $k$ found within the document.
# + variation: set -> bag
# + multiple white-space characters => one white-space
# + choosing k
#     - **large enough** that the prob. of **any given shingle** appearing in **any given document** is **low**;
#       the larger $k$ is, the stricter the notion of similarity becomes.
#     - estimate the number of k-shingles as $20^k$ rather than $27^k$, because character frequencies are far from uniform.
#     - in general:
#       emails: k = 5;
#       large documents, such as research articles: k = 9.
# + stop words, such as "and", "you", "to"
#     - ignored in many applications
#     - **"stop word + two following words"** used as the shingle when finding similar **news articles**.
# 
# **Hashing shingles**: h(shingle) = bucket number
# + 9-shingles hashed down to 4 bytes work better than using 4-shingles directly.

# In[4]:

logger.setLevel(logging.WARNING)

def to_alphanumeric_string(s):
    """remove any non-alphanumeric character in string s.
    """
    import re
    s = s.lower()
    s = re.sub(r'\W', ' ', s)
    logging.debug('after: del special char: {}'.format(s))
    s = re.sub(r'\s+', ' ', s).strip()
    logging.debug('after: del multiple whitespace: {}'.format(s))
    return s

def k_shingle_extract(document, k):
    """extract all k-shingles from document.
    """
    document = to_alphanumeric_string(document)
    assert len(document) >= k, 'k should be no greater than the length of document'

    k_shingles = []
    while len(document) >= k:
        shingle = document[0:k]
        if shingle not in k_shingles:
            k_shingles.append(shingle)
        document = document[1:]
    return k_shingles

print('Example 3.3: {}'.format(k_shingle_extract('abcdabd', 2)))
print('other example: {}'.format(k_shingle_extract("Hello, I'm So-So.  ", 2)))


# ##### exercise

# In[5]:

#exercise 3.2.1
data = "The most effective way to represent documents as sets, for the purpose of identifying lexically similar documents is to construct from the document the set of short strings that appear within it. "
print('first 10 3-shingles:{}'.format(k_shingle_extract(data, 3)[0:10]))


# In[6]:

#exercise 3.2.2
logger.setLevel(logging.WARNING)

def stopwords_shingles_extract(document, k):
    """extract the stop-word shingles.

    stop-word shingle = a stop word plus the next k words
    (slicing past the end of the word list is safe: Python just returns a shorter shingle)
    """
    stop_words_list = ['the', 'you', 'to', 'as', 'for', 'of', 'is', 'that', 'it', 'from']
    document = to_alphanumeric_string(document)
    document = document.split()
    logging.debug('split:{}'.format(document))

    shingles = []
    k = k + 1   #len(shingle) = 1 stop word + k words
    while document:
        logging.debug('check:{}'.format(document[0]))
        if document[0] in stop_words_list:
            shingle = ' '.join(document[0:k])
            logging.debug('hit: {}'.format(shingle))
            if shingle not in shingles:
                shingles.append(shingle)
        document = document[1:]
    return shingles

print('stop-shingles:{}'.format(stopwords_shingles_extract(data, 2)))


# `#exercise 3.2.3`
# 
# Maximum number of k-shingles: all extracted shingles are distinct, so taking
# slices from left to right gives
# the largest number = n - (k - 1) = n - k + 1
# 
# ### 3.3 Similarity-Preserving Summaries of Sets
# 
# ##### Matrix Representation of Sets:
# (r,c) = 1: the element for row r is a member of the set for column c.
# (r,c) = 0: otherwise
# 
# + these matrices are almost always **sparse**

# In[7]:

#Example 3.6
logger.setLevel(logging.WARNING)

my_database = ['a', 'b', 'c', 'd', 'e']
my_features_dict = {
    'S1': ['a', 'd'],
    'S2': ['c'],
    'S3': ['b', 'd', 'e'],
    'S4': ['a', 'c', 'd']
}

def matrix_representation_create(database, features_dict):
    """create the matrix representation of one database.
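
    Rows are indexed by `database` (the list of all possible elements) and
    columns by the sorted keys of `features_dict` (the set names); entry (r, c)
    is 1 if element r is a member of set c, and 0 otherwise.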
""" matrix_ = np.zeros((len(database), len(features_dict)), dtype=np.int) matrix = pd.DataFrame(matrix_, index=database, columns=sorted(features_dict.keys())) for feature_name, values in features_dict.iteritems(): for value in values: matrix.loc[value, feature_name] = 1 return matrix my_matrix = matrix_representation_create(my_database, my_features_dict) my_matrix # ##### Minhashing # # _target_: sets of shingles are large =====> much smaller representations called "**signatures**" # # **minhash** # 1. pick a permutation of the rows # 2. h(columu c) = the row which has the first "1" # In[8]: #Example 3.7 logger.setLevel(logging.WARNING) def minhash(matrix, row_orders): """calculate the minhash value of matrix according to element permutation in Fig 3.3. """ hash_fun_names = ['h{}'.format(i) for i in range(1, len(row_orders)+1)] hash_table = pd.DataFrame(np.zeros((len(row_orders), matrix.shape[1])), index=hash_fun_names, columns=matrix.columns) for row_order, hash_fun_name in zip(row_orders, hash_fun_names): matrix_p = matrix.loc[row_order,:] logging.debug('after permutation: \n{}'.format(matrix_p)) for c in matrix_p.columns: first_one_index = next((i for i, x in enumerate(matrix_p.loc[:,c]) if x), None) hash_table.loc[hash_fun_name, c] = row_order[first_one_index] return hash_table minhash(my_matrix, [['b', 'e', 'a', 'd', 'c']]) # **minhash and Jaccard Similarity**: $P[h(S1) = h(S2)] = SIM(S1, S2)$ # # *Proof*: # rows can be divided into three classes: # # + Type X rows have 1 in both columns (means both S1 and S2 have these features) # + Type Y rows have 1 in either S1 or S2 # + Type Z rows have 0 in both columns (means both S1 and S2 don't have them) # # \begin{align} # | S1 \cap S2 | & = len(X) \\ # | S1 \cap S2 | & = len(X+Y) \\ # SIM(S1, S2) &= \frac{len(X)}{len(X+Y)} # \end{align} # # randomly permute rows, there is at least one colunm whose value is 1: # $$ P[h(S1) = h(S2)] = \frac{len(x)}{len(X+Y)} $$ # # **minhash signature**: # use vector $[h_1(S), h_2(S), ... , h_n(S)]$ to represent matrix M # namely, reduce the dim: columns(M) -----> n # # **Computing Minhash Signatures** # picking a random permutation of large rows and find its hash value both are **time-consuming** ==> simulation # In[9]: logger.setLevel(logging.WARNING) my_matrix.index = range(my_matrix.shape[0]) print('matrix:\n{}\n'.format(my_matrix)) h_rows = [ [1, 2, 3, 4, 0], [1, 4, 2, 0, 3] ] def minhash_by_rows(matrix, row_orders): """calculate the minhash value of matrix according to row_permutation in Fig 3.4. 
""" hash_fun_names = ['h{}'.format(i) for i in range(1, len(row_orders)+1)] hash_table = pd.DataFrame(np.zeros((len(row_orders), matrix.shape[1])), index=hash_fun_names, columns=matrix.columns) for row_order, hash_fun_name in zip(row_orders, hash_fun_names): logging.debug('row_order:{}, h:{}'.format(row_order, hash_fun_name)) matrix_p = matrix.copy() matrix_p.index = row_order #new rows permutation matrix_p.sort_index(inplace=True) #adjust rows orders logging.debug('after permutation: \n{}'.format(matrix_p)) for c in matrix_p.columns: first_one_index = next((i for i, x in enumerate(matrix_p.loc[:,c]) if x), None) hash_table.loc[hash_fun_name, c] = first_one_index return hash_table my_minhash_res = minhash_by_rows(my_matrix, h_rows) print('minhash: 5dim -> 2dim \n\ true hash res:\n{}\n'.format(my_minhash_res)) print('Minhashing') for comb in list(itertools.combinations(range(4),2)): s_a, s_b = 'S{}'.format(comb[0]+1), 'S{}'.format(comb[1]+1) print('{}-{}'.format(s_a, s_b)), set_a, set_b = set(my_minhash_res.iloc[:,comb[0]]), set(my_minhash_res.iloc[:,comb[1]]) print('minhash:{:.3f}'.format(Jaccard_similarity_calc(set_a, set_b))), print('true SIM:{:.3f}'.format(Jaccard_similarity_calc(set(my_features_dict[s_a]), set(my_features_dict[s_b])))) # 对 $M$ 中的所有行做一次随机组合得到 $\bar{M}$,非常耗时。 # 解决方法是不直接操作矩阵 $M$,而是分两步操作: # # 1) 选择合适的哈希函数生成行号序列来模拟随机排列结果,如上个代码区块中 `h_rows` 即可用 $h_1(x) = (x + 1) \% 5$ 和 $h_2(x) = (3x + 1) \% 5$ 生成对应的行号 # In[10]: def add_hash_func(a, b, c): return lambda x: (a*x + b) % c h_funcs = [ add_hash_func(1, 1, 5), add_hash_func(3, 1, 5) ] h_rows = [] for h_func in h_funcs: h_rows.append(map(h_func, range(5))) print('h_rows:{}'.format(h_rows)) # 2) 再按照行号重新对行排序,生成新矩阵 $\bar{M}$ # 但第 2 步排序仍然需要操作矩阵,见函数 `minhash_by_rows` 中的语句 `matrix_p.sort_index(inplace=True)`。 # In[11]: #Fig 3.4 df_matrix = my_matrix df_h_rows = pd.DataFrame(np.array(h_rows).T) df_h_rows.columns = ['h{}'.format(x+1) for x in df_h_rows.columns] print('Hash functions computed for the matrix:\n{}\n'.format(pd.concat([df_matrix, df_h_rows], axis=1))) print('signature matrix\n(SIG):\n{}'.format(my_minhash_res)) # Let SIG(i,c) be the element of the signature matrix for the ith hash function and column c. # # 替代方法是借助思路:$ SIG(i,c) = min(h_i(S_c=1)) $ # 即对于 $h_i(S_c)$ 值来说,它只可能是 “1” 对应的行号,而第一个 “1” 肯定是最小的行号。 # # 比如,我们想得到 SIG(1,1) 的值, # $S1 = [1, 0, 0, 1, 0]^T$,对应的 $h1 = [1, 2, 3, 4, 0]^T$, # SIG(1,1) 只能取 “1” 的行号,即 1 或 4, # 又 SIG 是第一个遇到的 “1”,所以 SIG(1,1) 肯定是 1。 # # **思路总结**: # 我们对每个 SIG(i,c) 都可以通过遍历 $S_i$ 中 "1" 位置,取出 $h_c$ 中相应位置的行号,行号中最小的值就是解。 # # 而这个思路是可以并行化处理的,有两种方法: # # 1. 列遍历:即从左至右遍历 $S_i$ 得到 "1" 位置时,我们可以据此取出所有 $h_c$ 相应的行号,分别取最小。 # 2. 行遍历:书中 Example3.8 例子,从上往下遍历,逐行比较。 # 步骤如下: # 1. Compute $h_1(r), h_2(r), ... , h_n(r)$. # 2. For each column $c$ do the following: # + If $c$ has $0$ in row $r$, do nothing # + else, if $c$ has $1$ in row $r$, # for each $i = 1,2, ..., n$, # $SIG(i,c) = min \left( SIG(i,c), \, h_i(r) \right)$ # # 行遍历的方法,粗略想,计算量应该更少。 # `#todo: 验证计算量` # In[12]: #Example 3.8 logger.setLevel(logging.WARNING) def minhash_signatures_calc(df_M, hash_funcs, nagging=False): """computing minhash signatures by the way in Example 3.8. 
""" logging.debug('data matrix:\n{}\n'.format(df_M)) h = [] for hash_func in hash_funcs: h.append(map(hash_func, range(df_M.shape[0]))) df_h = pd.DataFrame(np.array(h).T) df_h.columns = ['h{}'.format(x+1) for x in df_h.columns] logging.debug('hash matrix:\n{}\n'.format(df_h)) if nagging: print('hash matrix:\n{}\n'.format(pd.concat([df_matrix, df_h], axis=1))) df_signatures = pd.DataFrame(np.ones((df_h.shape[1], df_M.shape[1]))*np.inf, index=df_h.columns, columns=df_M.columns) logging.debug('signatures matrix:\ninit\n{}\n'.format(df_signatures)) for r in df_M.index: for c in df_h.columns: r_1_loc = df_M.loc[r,:] == 1 logging.debug('r:{}, c:{}, 1 loc:\n{}\n'.format(r,c, r_1_loc)) sig_c = df_signatures.loc[c,:] line_bigger_loc = sig_c > df_h.loc[r, c] logging.debug('bigger row loc:\n{}\n'.format(line_bigger_loc)) sig_c[line_bigger_loc & r_1_loc] = df_h.loc[r, c] logging.debug('modified:\n{}\n'.format(sig_c)) df_signatures.loc[c,:] = sig_c if nagging: print('row:{},\n signature matrix:\n{}\n'.format(r, df_signatures)) return df_signatures minhash_signatures_calc(df_matrix, h_funcs, nagging=True) # ##### exercise 3.3 # In[13]: #exercise 3.3.1 #generate 120 permutations h_rows = list(itertools.permutations(range(5),5)) my_minhash_res = minhash_by_rows(my_matrix, h_rows) for comb in list(itertools.combinations(range(4),2)): s_a, s_b = 'S{}'.format(comb[0]+1), 'S{}'.format(comb[1]+1) print('{}-{}'.format(s_a, s_b)), #calc Jaccard similarity print('true SIM:{:.3f}'.format(Jaccard_similarity_calc(set(my_features_dict[s_a]), set(my_features_dict[s_b])))), #calc the fraction of the 120 permutations in which the value is same print('fraction:{:.3f}'.format(sum(my_minhash_res.loc[:,s_a] == my_minhash_res.loc[:,s_b])/120)) # In[14]: #exercise 3.3.2 h_funcs[2:4] = [ add_hash_func(2, 4, 5), add_hash_func(3, -1, 5) ] minhash_signatures_calc(df_matrix, h_funcs, nagging=True) # In[15]: #exercise 3.3.3 my_database = range(6) my_features_dict = { 'S1':[2, 5], 'S2':[0, 1], 'S3':[3, 4], 'S4':[0, 2, 4] } df_fig_3_5 = matrix_representation_create(my_database, my_features_dict) print('Fig 3.5:\n{}\n'.format(df_fig_3_5)) #(a) h_funcs = [ add_hash_func(2, 1, 6), add_hash_func(3, 2, 6), add_hash_func(5, 2, 6) ] df_matrix = df_fig_3_5 my_minhash_res = minhash_signatures_calc(df_matrix, h_funcs, nagging=True) my_minhash_res # In[16]: #(b) h_3 is a true permutation. #(c) for comb in list(itertools.combinations(range(4),2)): s_a, s_b = 'S{}'.format(comb[0]+1), 'S{}'.format(comb[1]+1) print('{}-{}'.format(s_a, s_b)), set_a, set_b = set(my_minhash_res.iloc[:,comb[0]]), set(my_minhash_res.iloc[:,comb[1]]) print('minhash:{:.3f}'.format(Jaccard_similarity_calc(set_a, set_b))), print('true SIM:{:.3f}'.format(Jaccard_similarity_calc(set(my_features_dict[s_a]), set(my_features_dict[s_b])))) # `#exercise 3.3.4` # 不是很明白意思,大概想法是: # 从 $n$ 中抽取 $m$ 个元素集合 $S$ 和 $T$,则从 $S$ 和 $T$ 中各抽一次,元素相同的概率是多少? # # `#exercise 3.3.5` # 元素相同的概率是 0 # ### 3.4 Locality-Sensitive Hashing for Documents # # minhash 降低了维度,但计算耗时问题仍然存在: # 令样本集有 $k$ 个元素,两两比较次数是 $C_k^2 = O(k^2)$。 # 若 $k$ 很大,则比较次数成乘方倍增长。 # # 解决思路: # 1. 并行处理 # 2. 只比较可能相似的样本 # often we want only the most similar pairs or all paris that are above some lower bound in similarity. # ----> Locality-sensitive hasing (LSH) or near-neighbor search. # # **LSH**: # "hash" items several times, in such a way: $$P[\text{similar items in the same bucket}] >> P[\text{dissimilar ...}]$$ # # candidate pair: any pair hashed to the same bucket. # false postive: dissimilar items in the candidate pair. 
# **Analysis**: the probability that a pair becomes a candidate pair.
# Suppose documents $s_a$ and $s_b$ have Jaccard similarity $s$, and we split their signature columns $M_a$ and $M_b$ into $b$ bands of $r$ rows each.
# 1. for any row $i$, $prob_{rowY} = P[M_a[i,:] == M_b[i,:]] = \text{Jaccard similarity} = s$ (the minhash property)
# 2. the probability that one band ($r$ rows) agrees completely is: $$prob_{bandY} = prob_{rowY}^r = s^r$$
#    and that it does not agree completely is: $$prob_{bandN} = 1 - prob_{bandY} = 1 - s^r$$
# 3. $s_a$ and $s_b$ agree in no band at all: $$prob_{pairN} = prob_{bandN}^b = (1 - s^r)^b$$
#    $s_a$ and $s_b$ **agree in at least one band**: $$prob_{pairY} = 1 - prob_{pairN} = 1 - (1 - s^r)^b$$
# 
# 
# **threshold**: the value of $s$ at which the probability of becoming a candidate is 1/2,
# namely $1 - (1 - s^r)^b = \frac{1}{2}$, with approximate solution $s \approx (1/b)^{1/r}$

# In[17]:

#Example 3.11
b = 20
r = 5
s = np.linspace(0, 1, 100)
def p(s, r, b):
    return 1 - (1 - s**r)**b
plt.plot(s, p(s, r, b))

s = np.arange(0.2, 0.9, 0.1)
list(zip(s, p(s, r, b)))


# if we consider two documents with 80% similarity, their false-negative rate is $1 - 0.99964... \approx 0.00035$, which means that only roughly 1 in 3000 pairs that are as much as 80% similar will fail to become a candidate pair.

# ##### Combining the Techniques
# Putting the previous methods together, the procedure for finding similar documents is:
# 
# 1. Compute the k-shingles of each document;
#    [opt] hash the k-shingles to shorter bucket numbers.
# 
# 2. Sort the document-shingle pairs;
#    instead of materializing the whole matrix, generate the rows on demand in shingle order when they are needed later (saves memory).
# 
# 3. Pick $n$ and compute the minhash signatures by feeding the sorted list in row by row.
# 
# 4. Choose a threshold $t$;
#    use $t \approx (1/b)^{1/r}$ and $n = b \times r$ to determine appropriate $b$, $r$.
# 
# 5. Construct candidate pairs using LSH.
# 
# 6. Examine each candidate pair's signatures: is $SIM > t$?
# 
# 7. [opt] check the documents themselves.
# 
# `#todo: find an example and implement this pipeline in code`

# ##### exercise 3.4
# `#maybe:`
# 
# `#exercise 3.4.3`
# 
# `#exercise 3.4.4`

# ### 3.5 Distance Measures
# 
# The point of **LSH** in general:
# for a particular distance measure, find suitable hash functions $h$ that produce the "S"-curve effect,
# i.e. the closer two items are under that distance, the more likely they hash to the same bucket.
# 
# This section introduces distance measures, as preparation for finding hash families in the next section.
# 
# ##### Definition of a distance measure
# 
# A metric on a set $X$ is a function $d: X \times X \to \mathbf{R}$
# such that for all $x, y, z \in X$:
# 
# 1. non-negativity
#    $d(x, y) \ge 0$;
#    $d(x, y) = 0$ if and only if $x = y$
# 
# 2. symmetry
#    $d(x, y) = d(y, x)$
# 
# 3. triangle inequality
#    $d(x, y) \le d(x, z) + d(z, y)$
# 
# 
# ##### Common distance measures
# 1. Euclidean Distance: $\| \cdot \|_2$;
#    more generally the $L_r$ distance $\left( \sum_i |x_i - y_i|^r \right)^{1/r}$
#    and the $L_\infty$ distance $\max_i |x_i - y_i|$
# 
# 2. Jaccard Distance: $d(x, y) = 1 - SIM(x, y)$
# 
# 3. Cosine Distance:
#    the angle between the vectors,
#    $d(x, y) = \arccos \left( \frac{x \cdot y}{\|x\|_2 \, \|y\|_2} \right)$
# 
# 4. Edit Distance (a small code sketch follows this list):
#    1. direct computation
#       Given $x = x_1 x_2 ...$, $y = y_1 y_2 ...$,
#       $d(x, y)$ is the smallest number of insertions and deletions of single characters that turn $x$ into $y$.
#       A [dynamic-programming algorithm](http://blog.csdn.net/xanxus46/article/details/38678335) exists;
#       the general Levenshtein recurrence (which also allows substitutions; drop that case, or give it cost 2, to get the insertion/deletion-only distance used here) is
# 
# \begin{equation}
# \operatorname{lev}_{a,b}(i,j) = \begin{cases}
# \max(i,j) & \text{if } \min(i,j)=0, \\
# \min \begin{cases}
# \operatorname{lev}_{a,b}(i-1,j) + 1 \\
# \operatorname{lev}_{a,b}(i,j-1) + 1 \\
# \operatorname{lev}_{a,b}(i-1,j-1) + 1_{(a_i \neq b_j)}
# \end{cases} &\text{otherwise.}
# \end{cases}
# \end{equation}
# 
#       See the Wikipedia article on [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) for details.
# 
#    2. indirect computation
#       $d(x, y) = len(x) + len(y) - 2 \, LCS(x, y)$,
#       where $LCS(x, y)$ is the length of the longest common subsequence of $x$ and $y$.
# 
# 5. Hamming Distance: $d(x, y)$ = the number of positions in which $x$ and $y$ differ.
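# As a quick illustration of the insertion/deletion edit distance above, the
# next cell is a minimal dynamic-programming sketch (my own addition, not from
# the book); the example strings are just for demonstration.

# In[ ]:

#edit distance with insertions and deletions only (no substitutions), DP sketch
def edit_distance_indel(x, y):
    """Smallest number of single-character insertions and deletions that turn x
    into y, computed with the classic dynamic-programming table.
    """
    m, n = len(x), len(y)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i                       #delete the first i characters of x
    for j in range(n + 1):
        d[0][j] = j                       #insert the first j characters of y
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if x[i-1] == y[j-1]:
                d[i][j] = d[i-1][j-1]     #characters match, no operation needed
            else:
                d[i][j] = min(d[i-1][j] + 1,   #delete x[i-1]
                              d[i][j-1] + 1)   #insert y[j-1]
    return d[m][n]

#'abcde' -> 'acfdeg': delete b, insert f, insert g => distance 3
#consistent with d(x, y) = len(x) + len(y) - 2 * LCS(x, y) = 5 + 6 - 2 * 4
print(edit_distance_indel('abcde', 'acfdeg'))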
# ### 3.6 The Theory of Locality-Sensitive Functions
# 
# For each distance measure we need to find a family of hash functions such that the probability-versus-similarity curve is an "S" curve (the more similar, the higher the probability).
# 
# a family of functions $\{h_i(x)\}$ should satisfy 3 conditions:
# 
# 1) $P[\text{close pairs become candidate pairs}] > P[\text{dissimilar pairs...}]$
#    Here "close" refers to the distance measure and $P$ is a probability; this says that the curve of $Prob(x, y)$ against $d(x, y)$ should be an "S" curve (a reversed "S" when plotted against distance).
#    e.g. the probability of collision under minhashing, plotted against Jaccard distance, is such a curve:

# In[18]:

s = np.linspace(0, 1, 100)
plt.plot(1-s, p(s, r, b))
plt.xlabel('Jaccard distance')
plt.ylabel('Probability')


# 2) The $h_i(x)$ should be statistically independent, because we need to estimate the probabilities with the product rule for independent events;
#    e.g. the factor $s^r$ in $1 - (1 - s^r)^b$ for Jaccard similarity.
# 
# 3) efficient in two ways:
#    + identify candidate pairs in time much less than the time it takes to look at all pairs.
#    + They must be combinable to build functions that are better at avoiding false positives and negatives, and the combined functions must also take time that is much less than the number of pairs.
#      When we combine hash functions by "band, then hash", the probability $1 - (1 - s^r)^b$ becomes an "S" curve (a reversed "S" against distance, as in the figure above);
#      so the base family we look for should already give closer pairs a higher collision probability, i.e. $P$ versus $d(x,y)$ should be a reversed "S". This leads to the next notion: locality sensitivity.
# 
# **$(d_1, d_2, p_1, p_2)$-sensitive**:
# 1. if $d(x, y) \leq d_1$, then $P[f(x) = f(y)] \geq p_1$.
# 2. if $d(x, y) \geq d_2$, then $P[f(x) = f(y)] \leq p_2$.
# 
# e.g.: For Jaccard distance, the family of minhash functions is a $(d_1, d_2, 1-d_1, 1-d_2)$-sensitive family for any $d_1$ and $d_2$ where $0 \leq d_1 < d_2 \leq 1$.

# In[19]:

#minhash for Jaccard distance
d = 1 - s
d_1, d_2 = 0.4, 0.6
plt.plot(d[d < d_1], p(s, r, b)[d < d_1])
plt.plot(d[d > d_2], p(s, r, b)[d > d_2])
plt.xlabel('d(x, y)')
plt.ylabel('Probability of being a candidate')


# **Amplifying a Locality-Sensitive Family**
# Given a $(d_1, d_2, p_1, p_2)$-sensitive family $F = \{f_1, ..., f_r\}$, we can construct a new family $F'$:
# 1. **AND**-construction:
#    $f(x) = f(y)$ when **all** $f_i(x) = f_i(y)$
#    $$(d_1, d_2, p_1^r, p_2^r)$$
# 
# 2. **OR**-construction:
#    $f(x) = f(y)$ when **any** $f_i(x) = f_i(y)$
#    $$(d_1, d_2, 1-(1-p_1)^r, 1-(1-p_2)^r)$$
# 

# In[20]:

#AND-construction and OR-construction
dis = d
pos = p(s, r, b)

r = 4
AND_pos = pos**r
OR_pos = 1 - (1 - pos)**r

plt.plot(dis, pos, label='origin')
plt.plot(dis, AND_pos, label='AND-cons')
plt.plot(dis, OR_pos, label='OR-cons')
plt.legend(loc='upper right')
plt.xlabel('d(x, y)')
plt.ylabel('P[f(x) = f(y)]')


# As shown in the figure above,
# + AND-construction: pushes probabilities towards 0, i.e. false negatives $\uparrow$, false positives $\downarrow$.
# 
# + OR-construction: pushes probabilities towards 1, i.e. false negatives $\downarrow$, false positives $\uparrow$.
# 
# AND- and OR-constructions can be cascaded to reach a desired $(d_1, d_2, p_1, p_2)$; a small numeric example follows.
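# For instance, cascading a 4-way AND-construction and then a 4-way
# OR-construction on a $(0.2, 0.6, 0.8, 0.4)$-sensitive family gives roughly a
# $(0.2, 0.6, 0.88, 0.10)$-sensitive family: both probabilities move, but the
# gap between them widens, which is the point of amplification. (The starting
# numbers here are only an illustration.)

# In[ ]:

#cascading: 4-way AND, then 4-way OR, on a (0.2, 0.6, 0.8, 0.4)-sensitive family
p1, p2 = 0.8, 0.4
r_and, b_or = 4, 4
p1_new = 1 - (1 - p1**r_and)**b_or
p2_new = 1 - (1 - p2**r_and)**b_or
print('p1: {:.2f} -> {:.4f}'.format(p1, p1_new))   #~0.8785
print('p2: {:.2f} -> {:.4f}'.format(p2, p2_new))   #~0.0985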
# **LSH families for Distance Measures**
# 1. For **Jaccard distance**,
#    the family of **minhash functions** is a $(d_1, d_2, 1-d_1, 1-d_2)$-sensitive family for any $d_1$ and $d_2$ where $0 \leq d_1 < d_2 \leq 1$.
# 
# 2. For **Hamming distance**,
#    given $d$-dimensional vectors $x$ and $y$, their Hamming distance is $h(x, y)$.
#    Define $f_i(x)$ to be the $i$th bit of $x$;
#    then for a randomly chosen $i$, $P[f_i(x) = f_i(y)] = 1 - h(x,y)/d$.
# 
#    $F = \{f_i(x)\}$ is $(d_1, d_2, 1 - d_1/d, 1 - d_2/d)$-sensitive for any $d_1 < d_2$
#    + $h(x, y) \in [0, d]$
#    + $max \, |F| = d$:
#      the number of functions is limited, so if $d$ is small it restricts how much AND- and OR-construction can be applied.
# 
# 3. For **Euclidean distance**,
#    $f$ is associated with a randomly chosen line which is divided into segments of length $a$, and the segments of the line are the buckets into which $f$ hashes points, as shown in Fig 3.13 of the book.
#    This family is $(a/2, 2a, 1/2, 1/3)$-sensitive.
# 
# 4. For **cosine distance**,
#    the angle between $x$ and $y$ is $h(x, y)$.
#    Pick a random hyperplane $S_i$ and take its normal vector $v_i$;
#    when $S_i$ lies inside the angle between $x$ and $y$, $v_i \cdot x$ and $v_i \cdot y$ have opposite signs; outside the angle, they have the same sign. See Fig 3.12 in the book.
# 
#    Definition:
#    $f(x) = f(y)$ if and only if $v_f \cdot x$ and $v_f \cdot y$ have the same sign.
# 
#    This family is $(d_1, d_2, (180-d_1)/180, (180-d_2)/180)$-sensitive.
# 
#    + my impression: the amount of computation for this family is not necessarily smaller than computing the cosine distance directly.

# **sketches**:
# Given a set of random vectors $V = \{v_1, ..., v_n\}$ with each $v_i \in \{+1, -1\}^d$,
# the sketch of $x$ is
# 
# \begin{align}
# sx & = [sx_1, ..., sx_n] \\
# & = [I(v_1 \cdot x), ..., I(v_n \cdot x)]
# \end{align}
# 
# where
# 
# \begin{equation}
# I(x) = \begin{cases}
# +1, & \quad x > 0 \\
# -1, & \quad x < 0 \\
# +1, \, \text{or} \, -1 & \quad x = 0
# \end{cases}
# \end{equation}
# 
# Comparing the similarity of two vectors $x$ and $y$ then reduces to $$P[f(x) = f(y)] = \frac{\#\{i : sx_i = sy_i\}}{n}$$

# In[21]:

#example 3.21
x = np.arange(3, 7)
y = np.arange(4, 0, -1)
print('x:{}, y:{}'.format(x, y))

from scipy.spatial.distance import cosine
print('angle between x and y: {:.3f}\n'.format(np.rad2deg(np.arccos(1-cosine(x, y)))))

def sketch_calc(x, v):
    sketch = np.dot(x, v)
    sketch[sketch > 0] = 1
    sketch[sketch <= 0] = -1   #the x == 0 case is arbitrarily mapped to -1
    return sketch

v = [[1, -1, 1, 1],
     [-1, 1, -1, 1],
     [1, 1, -1, -1]]
v = np.array(v).T
print('v:{}'.format(v))
x_s = sketch_calc(x, v)
print('sketch of x: {}'.format(x_s))
y_s = sketch_calc(y, v)
print('sketch of y: {}'.format(y_s))
print('p = same elements / total elements: {} / {}'.
      format(sum((x_s-y_s) == 0), len(x_s)))

def sketch_angle_calc(x_s, y_s):
    """estimate the angle between two vectors from the fraction of agreeing
    sketch positions: p = (180 - d) / 180.
    """
    p = sum((x_s-y_s) == 0) / len(x_s)
    d = 180 - 180 * p
    return d
print('angle(sketch): {}\n'.format(sketch_angle_calc(x_s, y_s)))

v = list(itertools.product([-1, 1], repeat=len(x)))
v = np.array(v).T
print('v:{}'.format(v))
x_s = sketch_calc(x, v)
print('sketch of x: {}'.format(x_s))
y_s = sketch_calc(y, v)
print('sketch of y: {}'.format(y_s))
print('p = same elements / total elements: {} / {}'.
      format(sum((x_s-y_s) == 0), len(x_s)))
print('angle(sketch): {}'.format(sketch_angle_calc(x_s, y_s)))


# ### 3.8 Applications of Locality-Sensitive Hashing
# ##### 1. Entity Resolution
# to match data records that refer to the same real-world entity,
# i.e. find the records that describe the same object in different databases.
# 
# e.g. reconciling the customer records of companies A and B (whose record formats differ):
# 1. Similarity scoring:
#    use the "name", "phone" and "address" fields; score each of the three by edit distance (distance 0 -> 100 points, down to 0 points), and let the accumulated total be the similarity score $s$.
#    + certain publicly available tables were used to reduce the penalty in appropriate situations, e.g. "Bill" and "William" are treated as differing by 1 letter.
# 
# 2. Comparing all pairs is too slow, so LSH is used first to find the potential matches:
#    hash the three fields separately and combine them with an AND-construction.
#    In practice this was done by plain sorting: sort by name, by phone and by address in turn, so that similar records end up next to each other.
# 
# 3. Estimating the rate of correct matches $f$:
#    use other fields, not used for the matching, as evidence.
#    Here the difference $t$ between the creation dates of the two matched records is used as the yardstick:
#    1. $\bar{t} = E[\text{random_pick}(t)]$: the expected time difference of a randomly chosen pair.
#    2. $t_p = E[t_{xy}: argmax_{x,y} s(x,y)]$: the expected time difference over the pairs with the highest scores $s$ (assumed to be all correct matches, $f = 1$).
#    3. Now suppose $s_i$ is used as the threshold, the matched pairs obtained this way have mean time difference $t_i$, and their rate of correct matches is $f_i$;
#       then $t_i = t_p \cdot f_i + \bar{t} \cdot (1 - f_i)$,
#       which solves to
#       $$f_i = \frac{\bar{t} - t_i}{\bar{t} - t_p}$$
# 
# That is, other fields of the records can be used to build a function that estimates the rate of correct matches.
# 
# 
# ##### 2. Matching Fingerprints
# Feature description: minutiae are used to build the feature vector (a grid of positions) of a fingerprint.
# 
# Two kinds of problems:
# 1. one-to-many: find the matches of one given fingerprint
# 2. many-to-many: find all pairs that are potentially the same
# 
# Suppose the probability that a randomly chosen position of a fingerprint's feature vector contains a minutia is $P[X] = 20\%$,
# and that for two fingerprints from the same finger, when one has a minutia at a position the probability that the other also has one there is $P[Y|X] = 80\%$.
# Define the **hash function** $f$ as:
# pick three positions of the feature vector; two fingerprints form a candidate pair if both have minutiae at all three positions.
# 
# Analysis: the probability that two unrelated fingerprints become candidates is $0.2^6 = 0.000064$; the probability that two fingerprints from the same finger become candidates is $0.2^3 \times 0.8^3 = 0.004096$.
# 
# Take 1024 such functions $f_i$, combine them with an OR-construction, and then apply a 2-way AND-construction; the results are computed in the next cell.

# In[22]:

#Example 3.22
p = np.array([0.2**6, (0.2**3)*(0.8**3)])
print('dissimilar:{:.4f}, similar:{:.4f}'.format(p[0], p[1]))

#1024 OR-cons
p = 1 - (1 - p)**1024
print('dissimilar:{:.4f}, similar:{:.4f}'.format(p[0], p[1]))

#then 2 AND-cons
p = p**2
print('dissimilar:{:.4f}, similar:{:.4f}'.format(p[0], p[1]))

#roughly one dissimilar pair in this many becomes a false positive
print(1/p[0])


# ##### 3. Matching Newspaper Articles
# difference between prose and ads/headlines:
# prose has a much greater frequency of stop words ====> define a _shingle_ to be a stop word followed by the next two words.

# ### 3.9 Methods for High Degrees of Similarity
# 
# **LSH-based methods** appear most **effective** when the degree of similarity we accept is **relatively low**.
# 
# ##### 1. Finding Identical Items
# methods:
# 1. hash the first few characters and compare.
#    + cons: many documents share the same header, e.g. HTML boilerplate.
# 2. hash the whole document.
#    + cons: examines every character of every document.
# 3. pick some fixed random positions, then hash and compare.
#    + problem: documents of different lengths have to be excluded first.
# 
# ##### 2. Finding highly similar items
# all pairs we want have a high Jaccard similarity, say at least 0.9.
# 
# **representing** sets as strings:
# represent a set by **sorting** the elements of the universal set in some **fixed order** and constructing a string of "characters".
# + no character appears more than once in a string
# + if two characters appear in two different strings, they appear in the same order in both strings
# + for documents, the elements are shingles
# 
# e.g.: 26 lower-case letters in normal alphabetical order: {d, a, b, b} => 'abd'
# 
# 1. Length-Based Filtering: sort the strings by length;
#    each string $s$ is compared only with those strings $t$ that follow $s$ in the list ($L_s \leq L_t$, where $L_x = len(x)$).
#    Let $J_u$ be the upper bound on the Jaccard distance between two strings.
# 
#    $SIM(s, t)$ is at most $L_s/L_t$, so the Jaccard distance is at least $1 - L_s/L_t$; requiring $1 - L_s/L_t \leq J_u$ gives
#    $$L_t \leq \frac{L_s}{1 - J_u}$$
# 
# 2. Prefix Indexing: index each string $s$ under its first $p$ symbols,
#    where $p$ is chosen so that two strings with $SIM(s, t) \geq J$ share at least one indexed symbol.
# 
#    Arguing from the opposite case: suppose $SIM(s, t) \geq J$ but none of the first $p$ symbols of $s$ appears in $t$; then at best $SIM(s, t) = (L_s - p) / L_s$. So as long as $J > (L_s - p) / L_s$, at least one of the first $p$ symbols must be shared, which gives
#    $$p = \lfloor (1 - J)L_s \rfloor + 1$$
# 
# 3. Using Position Information: use the positions as well to decide whether a comparison is needed.
#    The index entries are pairs $(x, j)$: symbol $x$ occurring at position $j$ of a string.
#    If $s[1:i-1] \cap t[1:j-1] = \emptyset$ and, at best, $s[i:] = t[j:]$,
#    then $$SIM(s, t) = \frac{L_s-(i-1)}{L_s+(j-1)} \geq J$$
#    which solves to $$j \leq (L_s(1 - J) - i + 1 + J)/J$$
# 
# 4. Using Position and Length in Indexes:
#    the index entries are triples (x, j, suffix length).
#    Given entries $(s_i, i, p)$ and $(t_j, j, q)$ whose prefixes share nothing, while one suffix is, at best, contained in the other, there are two cases:
#    + if $p \geq q$, then $$\frac{L_s - (i-1) - (p-q)}{L_s + j - 1} = \frac{q + 1}{L_s + j - 1} \geq J$$
#    + if $p < q$, then $$\frac{L_s - (i-1)}{L_s + (j-1) + (q-p)} = \frac{L_s - i + 1}{i + j - 1 + q} \geq J$$

# In[ ]:
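# To make section 3.9 concrete, the cell below is a rough sketch (my own
# addition, following the formulas above) of length-based filtering combined
# with prefix indexing. The input strings are assumed to already be set
# representations over a globally ordered alphabet, and the returned pairs are
# only candidates: their actual Jaccard similarity still has to be verified.

# In[ ]:

#sketch: length-based filtering + prefix indexing for high-similarity search
from collections import defaultdict

def prefix_length(s, J):
    """Number of leading symbols of s to index so that any string t with
    SIM(s, t) >= J shares at least one of them: p = floor((1 - J) * len(s)) + 1.
    """
    return int((1 - J) * len(s)) + 1

def candidate_pairs_high_sim(strings, J):
    """Return candidate pairs of strings that may have Jaccard similarity >= J."""
    strings = sorted(strings, key=len)          #scan from short to long
    index = defaultdict(list)                   #symbol -> strings indexed under it
    candidates = set()
    for s in strings:
        for ch in s[:prefix_length(s, J)]:
            for t in index[ch]:                 #every t here is no longer than s
                if len(s) * J <= len(t):        #length filter: SIM <= len(t) / len(s)
                    candidates.add((t, s))
            index[ch].append(s)
    return candidates

#toy example over lower-case letters in alphabetical order
print(candidate_pairs_high_sim(['abd', 'abde', 'bde', 'cef'], J=0.7))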