首页>代码>springboot2.0+lucene简单demo,实现高亮,全文索引。>/lucene-master/src/main/java/com/xywy/lucene/core/AnalyzeContext.java
001 | /** |
002 | * IK 中文分词 版本 5.0 |
003 | * IK Analyzer release 5.0 |
004 | * |
005 | * Licensed to the Apache Software Foundation (ASF) under one or more |
006 | * contributor license agreements. See the NOTICE file distributed with |
007 | * this work for additional information regarding copyright ownership. |
008 | * The ASF licenses this file to You under the Apache License, Version 2.0 |
009 | * (the "License"); you may not use this file except in compliance with |
010 | * the License. You may obtain a copy of the License at |
011 | * |
012 | * http://www.apache.org/licenses/LICENSE-2.0 |
013 | * |
014 | * Unless required by applicable law or agreed to in writing, software |
015 | * distributed under the License is distributed on an "AS IS" BASIS, |
016 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
017 | * See the License for the specific language governing permissions and |
018 | * limitations under the License. |
019 | * |
020 | * 源代码由林良益(linliangyi2005@gmail.com)提供 |
021 | * 版权声明 2012,乌龙茶工作室 |
022 | * provided by Linliangyi and copyright 2012 by Oolong studio |
023 | * |
024 | */ |
025 | package com.xywy.lucene.core; |
026 |
027 | import com.xywy.lucene.cfg.Configuration; |
028 |
029 | import java.io.IOException; |
030 | import java.io.Reader; |
031 | import com.xywy.lucene.dic.Dictionary; |
032 |
033 | import java.util.*; |
034 |
035 | /** |
036 | * |
037 | * 分词器上下文状态 |
038 | * |
039 | */ |
040 | class AnalyzeContext { |
041 |
042 | // 默认缓冲区大小 |
043 | private static final int BUFF_SIZE = 4096 ; |
044 | // 缓冲区耗尽的临界值 |
045 | private static final int BUFF_EXHAUST_CRITICAL = 100 ; |
046 |
047 | // 字符窜读取缓冲 |
048 | private char [] segmentBuff; |
049 | // 字符类型数组 |
050 | private int [] charTypes; |
051 |
052 | // 记录Reader内已分析的字串总长度 |
053 | // 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 |
054 | private int buffOffset; |
055 | // 当前缓冲区位置指针 |
056 | private int cursor; |
057 | // 最近一次读入的,可处理的字串长度 |
058 | private int available; |
059 |
060 | // 子分词器锁 |
061 | // 该集合非空,说明有子分词器在占用segmentBuff |
062 | private Set<String> buffLocker; |
063 |
064 | // 原始分词结果集合,未经歧义处理 |
065 | private QuickSortSet orgLexemes; |
066 | // LexemePath位置索引表 |
067 | private Map<Integer, LexemePath> pathMap; |
068 | // 最终分词结果集 |
069 | private LinkedList<Lexeme> results; |
070 |
071 | // 分词器配置项 |
072 | private Configuration cfg; |
073 |
074 | public AnalyzeContext(Configuration cfg) { |
075 | this .cfg = cfg; |
076 | this .segmentBuff = new char [BUFF_SIZE]; |
077 | this .charTypes = new int [BUFF_SIZE]; |
078 | this .buffLocker = new HashSet<String>(); |
079 | this .orgLexemes = new QuickSortSet(); |
080 | this .pathMap = new HashMap<Integer, LexemePath>(); |
081 | this .results = new LinkedList<Lexeme>(); |
082 | } |
083 |
084 | int getCursor() { |
085 | return this .cursor; |
086 | } |
087 |
088 | // |
089 | // void setCursor(int cursor){ |
090 | // this.cursor = cursor; |
091 | // } |
092 |
093 | char [] getSegmentBuff() { |
094 | return this .segmentBuff; |
095 | } |
096 |
097 | char getCurrentChar() { |
098 | return this .segmentBuff[ this .cursor]; |
099 | } |
100 |
101 | int getCurrentCharType() { |
102 | return this .charTypes[ this .cursor]; |
103 | } |
104 |
105 | int getBufferOffset() { |
106 | return this .buffOffset; |
107 | } |
108 |
109 | /** |
110 | * 根据context的上下文情况,填充segmentBuff |
111 | * @param reader |
112 | * @return 返回待分析的(有效的)字串长度 |
113 | * @throws IOException |
114 | */ |
115 | int fillBuffer(Reader reader) throws IOException { |
116 | int readCount = 0 ; |
117 | if ( this .buffOffset == 0 ) { |
118 | // 首次读取reader |
119 | readCount = reader.read(segmentBuff); |
120 | } else { |
121 | int offset = this .available - this .cursor; |
122 | if (offset > 0 ) { |
123 | // 最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部 |
124 | System.arraycopy( this .segmentBuff, this .cursor, this .segmentBuff, 0 , offset); |
125 | readCount = offset; |
126 | } |
127 | // 继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分 |
128 | readCount += reader.read( this .segmentBuff, offset, BUFF_SIZE - offset); |
129 | } |
130 | // 记录最后一次从Reader中读入的可用字符长度 |
131 | this .available = readCount; |
132 | // 重置当前指针 |
133 | this .cursor = 0 ; |
134 | return readCount; |
135 | } |
136 |
137 | /** |
138 | * 初始化buff指针,处理第一个字符 |
139 | */ |
140 | void initCursor() { |
141 | this .cursor = 0 ; |
142 | this .segmentBuff[ this .cursor] = CharacterUtil.regularize( this .segmentBuff[ this .cursor]); |
143 | this .charTypes[ this .cursor] = CharacterUtil.identifyCharType( this .segmentBuff[ this .cursor]); |
144 | } |
145 |
146 | /** |
147 | * 指针+1 |
148 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false |
149 | * 并处理当前字符 |
150 | */ |
151 | boolean moveCursor() { |
152 | if ( this .cursor < this .available - 1 ) { |
153 | this .cursor++; |
154 | this .segmentBuff[ this .cursor] = CharacterUtil.regularize( this .segmentBuff[ this .cursor]); |
155 | this .charTypes[ this .cursor] = CharacterUtil.identifyCharType( this .segmentBuff[ this .cursor]); |
156 | return true ; |
157 | } else { |
158 | return false ; |
159 | } |
160 | } |
161 |
162 | /** |
163 | * 设置当前segmentBuff为锁定状态 |
164 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff |
165 | * @param segmenterName |
166 | */ |
167 | void lockBuffer(String segmenterName) { |
168 | this .buffLocker.add(segmenterName); |
169 | } |
170 |
171 | /** |
172 | * 移除指定的子分词器名,释放对segmentBuff的占用 |
173 | * @param segmenterName |
174 | */ |
175 | void unlockBuffer(String segmenterName) { |
176 | this .buffLocker.remove(segmenterName); |
177 | } |
178 |
179 | /** |
180 | * 只要buffLocker中存在segmenterName |
181 | * 则buffer被锁定 |
182 | * @return boolean 缓冲去是否被锁定 |
183 | */ |
184 | boolean isBufferLocked() { |
185 | return this .buffLocker.size() > 0 ; |
186 | } |
187 |
188 | /** |
189 | * 判断当前segmentBuff是否已经用完 |
190 | * 当前执针cursor移至segmentBuff末端this.available - 1 |
191 | * @return |
192 | */ |
193 | boolean isBufferConsumed() { |
194 | return this .cursor == this .available - 1 ; |
195 | } |
196 |
197 | /** |
198 | * 判断segmentBuff是否需要读取新数据 |
199 | * |
200 | * 满足一下条件时, |
201 | * 1.available == BUFF_SIZE 表示buffer满载 |
202 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 |
203 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer |
204 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作) |
205 | * @return |
206 | */ |
207 | boolean needRefillBuffer() { |
208 | return this .available == BUFF_SIZE && this .cursor < this .available - 1 |
209 | && this .cursor > this .available - BUFF_EXHAUST_CRITICAL && ! this .isBufferLocked(); |
210 | } |
211 |
212 | /** |
213 | * 累计当前的segmentBuff相对于reader起始位置的位移 |
214 | */ |
215 | void markBufferOffset() { |
216 | this .buffOffset += this .cursor; |
217 | } |
218 |
219 | /** |
220 | * 向分词结果集添加词元 |
221 | * @param lexeme |
222 | */ |
223 | void addLexeme(Lexeme lexeme) { |
224 | this .orgLexemes.addLexeme(lexeme); |
225 | } |
226 |
227 | /** |
228 | * 添加分词结果路径 |
229 | * 路径起始位置 ---> 路径 映射表 |
230 | * @param path |
231 | */ |
232 | void addLexemePath(LexemePath path) { |
233 | if (path != null ) { |
234 | this .pathMap.put(path.getPathBegin(), path); |
235 | } |
236 | } |
237 |
238 | /** |
239 | * 返回原始分词结果 |
240 | * @return |
241 | */ |
242 | QuickSortSet getOrgLexemes() { |
243 | return this .orgLexemes; |
244 | } |
245 |
246 | /** |
247 | * 推送分词结果到结果集合 |
248 | * 1.从buff头部遍历到this.cursor已处理位置 |
249 | * 2.将map中存在的分词结果推入results |
250 | * 3.将map中不存在的CJDK字符以单字方式推入results |
251 | */ |
252 | void outputToResult() { |
253 | int index = 0 ; |
254 | for (; index <= this .cursor;) { |
255 | // 跳过非CJK字符 |
256 | if (CharacterUtil.CHAR_USELESS == this .charTypes[index]) { |
257 | index++; |
258 | continue ; |
259 | } |
260 | // 从pathMap找出对应index位置的LexemePath |
261 | LexemePath path = this .pathMap.get(index); |
262 | if (path != null ) { |
263 | // 输出LexemePath中的lexeme到results集合 |
264 | Lexeme l = path.pollFirst(); |
265 | while (l != null ) { |
266 | this .results.add(l); |
267 | // 将index移至lexeme后 |
268 | index = l.getBegin() + l.getLength(); |
269 | l = path.pollFirst(); |
270 | if (l != null ) { |
271 | // 输出path内部,词元间遗漏的单字 |
272 | for (; index < l.getBegin(); index++) { |
273 | this .outputSingleCJK(index); |
274 | } |
275 | } |
276 | } |
277 | } else { // pathMap中找不到index对应的LexemePath |
278 | // 单字输出 |
279 | this .outputSingleCJK(index); |
280 | index++; |
281 | } |
282 | } |
283 | // 清空当前的Map |
284 | this .pathMap.clear(); |
285 | } |
286 |
287 | /** |
288 | * 对CJK字符进行单字输出 |
289 | * @param index |
290 | */ |
291 | private void outputSingleCJK( int index) { |
292 | if (CharacterUtil.CHAR_CHINESE == this .charTypes[index]) { |
293 | Lexeme singleCharLexeme = new Lexeme( this .buffOffset, index, 1 , Lexeme.TYPE_CNCHAR); |
294 | this .results.add(singleCharLexeme); |
295 | } else if (CharacterUtil.CHAR_OTHER_CJK == this .charTypes[index]) { |
296 | Lexeme singleCharLexeme = new Lexeme( this .buffOffset, index, 1 , Lexeme.TYPE_OTHER_CJK); |
297 | this .results.add(singleCharLexeme); |
298 | } |
299 | } |
300 |
301 | /** |
302 | * 返回lexeme |
303 | * |
304 | * 同时处理合并 |
305 | * @return |
306 | */ |
307 | Lexeme getNextLexeme() { |
308 | // 从结果集取出,并移除第一个Lexme |
309 | Lexeme result = this .results.pollFirst(); |
310 | while (result != null ) { |
311 | // 数量词合并 |
312 | this .compound(result); |
313 | if (Dictionary.getSingleton().isStopWord( this .segmentBuff, result.getBegin(), |
314 | result.getLength())) { |
315 | // 是停止词继续取列表的下一个 |
316 | result = this .results.pollFirst(); |
317 | } else { |
318 | // 不是停止词, 生成lexeme的词元文本,输出 |
319 | result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength())); |
320 | break ; |
321 | } |
322 | } |
323 | return result; |
324 | } |
325 |
326 | /** |
327 | * 重置分词上下文状态 |
328 | */ |
329 | void reset() { |
330 | this .buffLocker.clear(); |
331 | this .orgLexemes = new QuickSortSet(); |
332 | this .available = 0 ; |
333 | this .buffOffset = 0 ; |
334 | this .charTypes = new int [BUFF_SIZE]; |
335 | this .cursor = 0 ; |
336 | this .results.clear(); |
337 | this .segmentBuff = new char [BUFF_SIZE]; |
338 | this .pathMap.clear(); |
339 | } |
340 |
341 | /** |
342 | * 组合词元 |
343 | */ |
344 | private void compound(Lexeme result) { |
345 | if (! this .cfg.useSmart()) { |
346 | return ; |
347 | } |
348 | // 数量词合并处理 |
349 | if (! this .results.isEmpty()) { |
350 |
351 | if (Lexeme.TYPE_ARABIC == result.getLexemeType()) { |
352 | Lexeme nextLexeme = this .results.peekFirst(); |
353 | boolean appendOk = false ; |
354 | if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) { |
355 | // 合并英文数词+中文数词 |
356 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM); |
357 | } else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) { |
358 | // 合并英文数词+中文量词 |
359 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); |
360 | } |
361 | if (appendOk) { |
362 | // 弹出 |
363 | this .results.pollFirst(); |
364 | } |
365 | } |
366 |
367 | // 可能存在第二轮合并 |
368 | if (Lexeme.TYPE_CNUM == result.getLexemeType() && ! this .results.isEmpty()) { |
369 | Lexeme nextLexeme = this .results.peekFirst(); |
370 | boolean appendOk = false ; |
371 | if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) { |
372 | // 合并中文数词+中文量词 |
373 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); |
374 | } |
375 | if (appendOk) { |
376 | // 弹出 |
377 | this .results.pollFirst(); |
378 | } |
379 | } |
380 |
381 | } |
382 | } |
383 |
384 | } |