$ curl --head http://127.0.0.1:9800/es/steelDict
HTTP/1.1 200 OK
Etag: DefaultTags
Last-Modified: 2021-10-15 14:49:35
Date: Fri, 15 Oct 2021 07:23:15 GMT
GET 格式
- 返回词库时,响应头中一定要带上Content-Length,并且Content-Type中一定要包含charset=UTF-8。
- Last-Modified和Etag只需要其中1个有变化即可。只有当HEAD请求返回的这2个字段中任意一个的值发生了变化,才会发送GET请求获取词库内容,请注意!
- 一行代表一个词,自己追加\n换行
$ curl -i http://127.0.0.1:9800/es/steelDict
HTTP/1.1 200 OK
Content-Length: 130
Content-Type: text/html;charset=UTF-8
Etag: DefaultTags
Last-Modified: 2021-10-15 14:49:35
Date: Fri, 15 Oct 2021 07:37:47 GMT

装饰管
装饰板
圆钢
无缝管
无缝方管
卫生级无缝管
卫生级焊管
热轧中厚板
热轧平板
热轧卷平板
实现
配置ES IK分词器
# 这里以centos 7为例,通过rpm安装
$ vim /usr/share/elasticsearch/plugins/ik/config/IKAnalyzer.cfg.xml
# 改这一行,换成我们的地址
<entry key="remote_ext_dict">http://10.16.52.52:9800/es/steelDict</entry>
$ systemctl restart elasticsearch # 重启es
# 这里还可以实时看到日志,比较方便
$ tail -f /var/log/elasticsearch/my-application.log
[2021-10-15T15:02:31,448][INFO ][o.w.a.d.Monitor ] [node-1] 获取远程词典成功,总数为:0
[2021-10-15T15:02:31,952][INFO ][o.e.l.LicenseService ] [node-1] license [3ca1dc7b-3722-40e5-916e-3b2093980b75] mode [basic] - valid
[2021-10-15T15:02:31,962][INFO ][o.e.g.GatewayService ] [node-1] recovered [1] indices into cluster_state
[2021-10-15T15:02:32,812][INFO ][o.e.c.r.a.AllocationService] [node-1] Cluster health status changed from [RED] to [YELLOW] (reason: [shards started [[steel-category-mapping][2]] ...]).
[2021-10-15T15:02:41,630][INFO ][o.w.a.d.Monitor ] [node-1] 重新加载词典...
[2021-10-15T15:02:41,631][INFO ][o.w.a.d.Monitor ] [node-1] try load config from /etc/elasticsearch/analysis-ik/IKAnalyzer.cfg.xml
[2021-10-15T15:02:41,631][INFO ][o.w.a.d.Monitor ] [node-1] try load config from /usr/share/elasticsearch/plugins/ik/config/IKAnalyzer.cfg.xml
[2021-10-15T15:02:41,886][INFO ][o.w.a.d.Monitor ] [node-1] [Dict Loading] http://10.16.52.52:9800/es/steelDict
[2021-10-15T15:02:43,958][INFO ][o.w.a.d.Monitor ] [node-1] 获取远程词典成功,总数为:0
[2021-10-15T15:02:43,959][INFO ][o.w.a.d.Monitor ] [node-1] 重新加载词典完毕...
Golang接口
假设使用gin框架,初始化路由:
const (
kUrlSyncESIndex = "/syncESIndex" // 同步钢材品名、材质、规格、产地、仓库到ES索引中
kUrlGetSteelHotDict = "/steelDict" // 获取钢材字典(品材规产仓)
) func InitRouter(router *gin.Engine) {
// ... esRouter := router.Group("es")
// 同一个接口,根据head/get来决定是否返回数据部,避免宽带浪费
esRouter.HEAD(kUrlGetSteelHotDict, onHttpGetSteelHotDictHead)
esRouter.GET(kUrlGetSteelHotDict, onHttpGetSteelHotDict) // ...
}
// onHttpGetSteelHotDictHead handles the HEAD request for the dictionary.
//
// It returns only the Last-Modified and ETag headers. The client issues the
// follow-up GET for the word list only when one of these two values has
// changed since its previous HEAD request.
//
// If the last sync time cannot be queried, the error is logged and the
// request is answered with 500 instead of 200, so the failure is visible to
// the poller rather than looking like a successful response without headers.
func onHttpGetSteelHotDictHead(ctx *gin.Context) {
	t, err := biz.QueryEsLastSyncTime()
	if err != nil {
		logger.Warn(err)
		ctx.JSON(http.StatusInternalServerError, gin.H{
			"code": biz.StatusError,
			"msg":  "server internal error",
		})
		return
	}

	ctx.Header("Last-Modified", t)
	ctx.Header("ETag", kDefaultTags)
	// Make the success status explicit; a HEAD response carries no body.
	ctx.Status(http.StatusOK)
}
// onHttpGetSteelHotDict 处理GET请求,返回真正的词库,每一行一个词
func onHttpGetSteelHotDict(ctx *gin.Context) {
// 这里从mysql查询词库,dic是一个[]string切片
dic, err := biz.QuerySteelHotDic()
if err != nil {
ctx.JSON(http.StatusOK, gin.H{
"code": biz.StatusError,
"msg": "server internal error",
})
logger.Warn(err)
return
} // 这里查询最后一次更新时间,作为判断词库需要更新的标准
t, err := biz.QueryEsLastSyncTime()
if err != nil {
ctx.JSON(http.StatusOK, gin.H{
"code": biz.StatusError,
"msg": "server internal error",
})
logger.Warn(err)
return
} ctx.Header("Last-Modified", t)
ctx.Header("ETag", kDefaultTags) body := ""
for _, v := range dic {
if v != "" {
body += v + "\n"
}
}
logger.Infof("%s query steel dict success, count = %d", ctx.Request.URL, len(dic)) buffer := []byte(body)
ctx.Header("Content-Length", strconv.Itoa(len(buffer)))
ctx.Data(http.StatusOK, "text/html;charset=UTF-8", buffer)
}
POST http://10.0.56.153:9200/_analyze
{
"analyzer": "ik_smart",
"text": "武钢 Q235B 3*1500*3000 6780 佰隆库 在途整件出"
}

{
"tokens": [
{
"token": "武钢",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "q235b",
"start_offset": 3,
"end_offset": 8,
"type": "CN_WORD",
"position": 1
},
{
"token": "3*1500*3000",
"start_offset": 9,
"end_offset": 20,
"type": "ARABIC",
"position": 2
},
{
"token": "6780",
"start_offset": 21,
"end_offset": 25,
"type": "ARABIC",
"position": 3
},
{
"token": "佰隆库",
"start_offset": 26,
"end_offset": 29,
"type": "CN_WORD",
"position": 4
},
{
"token": "在途",
"start_offset": 30,
"end_offset": 32,
"type": "CN_WORD",
"position": 5
},
{
"token": "整件",
"start_offset": 32,
"end_offset": 34,
"type": "CN_WORD",
"position": 6
},
{
"token": "出",
"start_offset": 34,
"end_offset": 35,
"type": "CN_CHAR",
"position": 7
}
]
}
/**
 * Loads the remote extension dictionaries into the main dictionary.
 */
private void loadRemoteExtDict() {
// ...
for (String theWord : lists) {
if (theWord != null && !"".equals(theWord.trim())) {
// Load the extension-dictionary word into the in-memory main dictionary.
// Comment out the following line (presumably so that not every loaded word is written to the log):
// logger.info(theWord);
_MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
}
// ...
}
mvn package
/**
 * Loads every configured remote extension dictionary into the main dictionary.
 *
 * For each remote location the word list is fetched; a location whose list
 * cannot be obtained is logged as an error and skipped. Every non-blank word
 * is logged, then trimmed, lower-cased and added to the main dictionary.
 */
private void loadRemoteExtDict() {
    for (String location : getRemoteExtDictionarys()) {
        logger.info("[Dict Loading] " + location);

        List<String> remoteWords = getRemoteWords(location);
        if (remoteWords == null) {
            // Nothing could be fetched for this location: report and move on.
            logger.error("[Dict Loading] " + location + "加载失败");
            continue;
        }
        logger.info("获取远程词典成功,总数为:" + remoteWords.size());

        for (String word : remoteWords) {
            if (word == null) {
                continue;
            }
            String trimmed = word.trim();
            if (trimmed.isEmpty()) {
                continue;
            }
            // The raw (untrimmed) word is logged; the normalized form is stored.
            logger.info(word);
            _MainDict.fillSegment(trimmed.toLowerCase().toCharArray());
        }
    }
}
[2021-10-15T15:02:41,886][INFO ][o.w.a.d.Monitor] [node-1] [Dict Loading] http://10.16.52.52:9800/es/steelDict
[2021-10-15T15:02:43,958][INFO ][o.w.a.d.Monitor] [node-1] 获取远程词典成功,总数为:0
[2021-10-15T15:02:43,959][INFO ][o.w.a.d.Monitor] [node-1] 重新加载词典完毕...
/**
 * Debug entry point: fetches the remote dictionary from the local test
 * endpoint and prints how many words were returned.
 */
public static void main(String[] args) {
    final String dictUrl = "http://127.0.0.1:9800/es/steelDict";
    int wordCount = getRemoteWordsUnprivileged(dictUrl).size();
    System.out.println(wordCount);
}
/**
 * Sub-segmenter for English letters and Arabic numerals.
 */
class LetterSegmenter implements ISegmenter {
// ...
// Connector symbols for letter tokens ('*' is appended here).
// NOTE(review): the elided code may require these arrays to be sorted before lookup - confirm '*' is handled.
private static final char[] Letter_Connector = new char[]{'#', '&', '+', '-', '.', '@', '_', '*'};
// Connector symbols for numeric tokens ('*' is appended here, presumably so a spec such as 3*1500*3000 stays one token).
private static final char[] Num_Connector = new char[]{',', '.', '*'};
// ...
}