{"id":1134,"date":"2019-08-21T01:03:28","date_gmt":"2019-08-20T16:03:28","guid":{"rendered":"https:\/\/oboki.net\/workspace\/?p=1134"},"modified":"2019-09-01T22:19:50","modified_gmt":"2019-09-01T13:19:50","slug":"elasticsearch-%ea%b2%80%ec%83%89%ec%97%94%ec%a7%84-%eb%a7%8c%eb%93%a4%ea%b8%b0-2-ims-%eb%8d%b0%ec%9d%b4%ed%84%b0-%ed%81%ac%eb%a1%a4%eb%a7%81","status":"publish","type":"post","link":"https:\/\/oboki.net\/workspace\/python\/elasticsearch-%ea%b2%80%ec%83%89%ec%97%94%ec%a7%84-%eb%a7%8c%eb%93%a4%ea%b8%b0-2-ims-%eb%8d%b0%ec%9d%b4%ed%84%b0-%ed%81%ac%eb%a1%a4%eb%a7%81\/","title":{"rendered":"[ElasticSearch] \uac80\uc0c9\uc5d4\uc9c4 \ub9cc\ub4e4\uae30 2 &#8211; IMS \ub370\uc774\ud130 \ud06c\ub864\ub9c1"},"content":{"rendered":"<h1>[ElasticSearch] \uac80\uc0c9\uc5d4\uc9c4 \ub9cc\ub4e4\uae30 2 &#8211; IMS \ub370\uc774\ud130 \ud06c\ub864\ub9c1<\/h1>\n<p><code>Selenium<\/code>, <code>BeautifulSoup<\/code> \ub97c \uc774\uc6a9\ud574\uc11c IMS \uc774\uc288 \ud398\uc774\uc9c0 Crawler \ub97c \ub9cc\ub4e0\ub2e4. \uae30\ucd08\uc801\uc778 \ubd80\ubd84\uc740 [\uc5ec\uae30]()\ub97c \ucc38\uace0.<\/p>\n<h2>\ub2e8\uc704 \ud06c\ub864\ub7ec<\/h2>\n<p>\uc218\uc9d1\ud560 \uc774\uc288 \ubc88\ud638\uc5d0 \ub300\ud55c \ub9ac\uc2a4\ud2b8\ub97c \ub9cc\ub4e4\uc5b4 \ud55c\ubc88\uc5d0 \uc218\uc9d1\ud558\uace0 Elasticsearch\uc5d0 \uc778\ub371\uc2f1 \ud558\ub294 <code>IndexIssueList<\/code> \ud568\uc218\ub97c \ub9cc\ub4e4\uc5c8\ub2e4. 
(IMS \ub294 \uc774\uc288 \ubc88\ud638\ub97c URL(<code><a href=\"https:\/\/ims.tmaxsoft.com\/tody\/ims\/issue\/issueView.do?issueId\">https:\/\/ims.tmaxsoft.com\/tody\/ims\/issue\/issueView.do?issueId<\/a>=<\/code>) \uc5d0 \ubcc0\uc218\ub85c \ub300\uc785\ud574\uc11c \uac01 \uc774\uc288 \ud398\uc774\uc9c0\uc5d0 \uc811\uadfc\ud560 \uc218 \uc788\ub2e4.)<\/p>\n<pre><code class=\"language-py\">_LOGIN_ID           = &quot;IMS_USERNAME&quot;\n_LOGIN_PASS         = &quot;IMS_PASSWORD&quot;\n_URL_LOGIN          = &quot;https:\/\/ims.tmaxsoft.com\/tody\/auth\/login.do&quot;\n_URL_ISSVIEW        = &quot;https:\/\/ims.tmaxsoft.com\/tody\/ims\/issue\/issueView.do?issueId=&quot;\n_SLCT_LOGIN_INPUT   = &quot;body &gt; form &gt; table &gt; tbody &gt; tr &gt; td &gt; table &gt; tbody &gt; tr:nth-child(2) &gt; td:nth-child(1) &gt; table &gt; tbody &gt; tr &gt; td:nth-child(2) &gt; table &gt; tbody &gt; tr &gt; td:nth-child(2) &gt; table &gt; tbody &gt; tr &gt; td:nth-child(3) &gt; input&quot;\n_SLCT_SUBJ          = &quot;body &gt; div:nth-of-type(2) &gt; table &gt; tbody &gt; tr &gt; td:nth-of-type(2) &gt; table &gt; tbody &gt; tr:nth-of-type(2) &gt; td &gt; table:nth-of-type(1) &gt; tbody &gt; tr &gt; td &gt; table &gt; tbody &gt; tr &gt; td:nth-of-type(2)&quot;\n_SLCT_DTLS          = &quot;#IssueDescriptionDiv &gt; table &gt; tbody &gt; tr &gt; td&quot;\n_SLCT_IINFO         = &quot;#issueInfoTable&quot;\n\ndef IndexIssueList(issueList):\n    import re\n    from selenium import webdriver\n    from bs4 import BeautifulSoup\n\n    driver = webdriver.PhantomJS(&#039;.\/webdriver\/phantomjs&#039;,\n        service_log_path=&#039;.\/logs\/ghostdriver.log&#039;)\n    driver.implicitly_wait(1)\n\n    driver.get(_URL_LOGIN)\n    driver.find_element_by_name(&#039;id&#039;).send_keys(_LOGIN_ID)\n    driver.find_element_by_name(&#039;password&#039;).send_keys(_LOGIN_PASS)\n    driver.find_element_by_css_selector(_SLCT_LOGIN_INPUT).click()\n\n    iss = []\n    for il in issueList:\n        
try:\n            data = {}\n            driver.get(_URL_ISSVIEW+str(il))\n            html = driver.page_source\n            soup = BeautifulSoup(html,&#039;html.parser&#039;)\n\n            subj = soup.select(_SLCT_SUBJ)[0].text.strip()\n            data.update({&#039;Subject&#039;:subj})\n\n            iinfo = soup.select(_SLCT_IINFO)[0].findAll(&#039;tr&#039;)\n            for b in iinfo:\n                key,val = &#039;&#039;,&#039;&#039;\n                for i in b.findAll(&#039;td&#039;,attrs={&#039;class&#039;:re.compile(r&#039;title|data&#039;)}):\n                    if i[&#039;class&#039;][0] == &#039;title&#039;: key = i.text.strip()\n                    else: val = i.text.strip()\n                data.update({key:re.sub(&#039;[\\n\\t ]+&#039;,&#039; &#039;,val)})\n\n            if data[&#039;Closed Date&#039;] == &#039;&#039;:\n                data[&#039;Closed Date&#039;]=&#039;1970\/01\/01 09:00:00&#039;\n            if data[&#039;Date of final order&#039;] == &#039;&#039;:\n                data[&#039;Date of final order&#039;]=&#039;1970\/01\/01 09:00:00&#039;\n\n            dtls = soup.select(_SLCT_DTLS)\n            data.update({&#039;Issue Details&#039;:re.sub(&#039;[\\n\\t ]+&#039;,&#039; &#039;,dtls[0].text.strip())})\n\n            comments,tmp = soup.findAll(&#039;div&#039;,attrs={&#039;class&#039;:&#039;commDescTR data&#039;}),&quot;&quot;\n            for c in comments: tmp += c.text.strip()+&#039;\\n&#039;\n            data.update({&#039;Action Log&#039;:tmp})\n            iss.append(data)\n        except IndexError as error:\n            continue\n\n    driver.close()\n\n    for i in range(len(iss)):\n        iss[i].update({&#039;Issue Number&#039;:re.search(r&#039;\\d+&#039;,iss[i][&#039;Issue Number&#039;]).group()})\n\n    actions = [\n        {\n            &quot;_index&quot;: &quot;issue-v0.1.4&quot;,\n            &quot;_type&quot;: &quot;_doc&quot;,\n            &quot;_id&quot;: iss[i][&#039;Issue Number&#039;],\n            
&quot;_source&quot;: iss[i]\n        }\n        for i in range(len(iss))\n    ]\n\n    from elasticsearch import Elasticsearch\n    from elasticsearch import helpers\n    es_client = Elasticsearch([&quot;localhost:9200&quot;],timeout=300)\n    res = helpers.bulk(es_client, actions)<\/code><\/pre>\n<p>\uc704 \ucf54\ub4dc\uc5d0\uc11c <code>driver.close()<\/code> \ubd80\ubd84\uae4c\uc9c0\uac00 IMS \uc6f9\uc5d0\uc11c \ub370\uc774\ud130\ub97c \uc218\uc9d1\ud558\ub294 \ubd80\ubd84\uc774\uace0, \uadf8 \uc544\ub798\uac00 elasticsearch \uc5d0 bulk indexing \uc73c\ub85c \ub370\uc774\ud130\ub97c \uc0c9\uc778\ud558\ub294 \ub2e8\uacc4\uc774\ub2e4. \uc218\uc9d1 \ub2e8\uacc4\uc5d0\uc11c\ub294 \uc218\uc9d1 \ub300\uc0c1 \uc6f9 \ud398\uc774\uc9c0\uc5d0 \ub530\ub77c \ub2e4\ub974\uac8c \uad6c\ud604\ub420 \uac83\uc774\uae30\ub54c\ubb38\uc5d0 \uadf8\ub54c\uadf8\ub54c \ub2ec\ub77c\uc9c8 \ubd80\ubd84\uc774\uc9c0\ub9cc \uc77c\ub2e8 \ud55c \ud398\uc774\uc9c0\uc758 \uc815\ubcf4\ub97c \ub515\uc154\ub108\ub9ac \ud615\ud0dc\ub85c \ub9cc\ub4e4\uace0 \uc774 \ub515\uc154\ub108\ub9ac\ub4e4\uc5d0 \ub300\ud55c \ub9ac\uc2a4\ud2b8\ub97c \ub9cc\ub4e4\uc5b4 \ub4a4\uc5d0\uc11c \uc21c\ud68c\ud560 \uc218 \uc788\ub3c4\ub85d \ud55c\ub2e4.<\/p>\n<p>\ub2e4\uc74c\uacfc \uac19\uc774 \ubc8c\ud06c \uc791\uc5c5\uc744 \ud560 \uac83\ub4e4\uc744 \ud558\ub098\uc758 \ub9ac\uc2a4\ud2b8 actions \ub85c \ub9cc\ub4e4\uc5b4\uc8fc\uba74\ub418\ub294\ub370 <code>_id<\/code> \ub294 Issue Number \uc640 \uac19\uac8c \ub9cc\ub4e4\uc5b4\uc11c \ucd94\ud6c4 \uc5c5\ub370\uc774\ud2b8 \uc791\uc5c5\uc774 \uc77c\uc5b4\ub0a0 \ub54c \ubc88\uac70\ub86d\uac8c id \ub97c \ucc3e\uc544\uc11c \uc5c5\ub370\uc774\ud2b8\ud558\uc9c0 \uc54a\uc544\ub3c4 \ub418\ub3c4\ub85d \ud55c\ub2e4.<\/p>\n<pre><code class=\"language-py\">    actions = [\n        {\n            &quot;_index&quot;: &quot;issue-v0.1.4&quot;,\n            &quot;_type&quot;: &quot;_doc&quot;,\n            &quot;_id&quot;: iss[i][&#039;Issue Number&#039;],\n            &quot;_source&quot;: 
iss[i]\n        }\n        for i in range(len(iss))\n    ]<\/code><\/pre>\n<h2>\ucd08\uae30 \uc801\uc7ac \ubc0f \uc804\uccb4 \uc5c5\ub370\uc774\ud2b8<\/h2>\n<p>\uae30\uc874\uc5d0 \uc874\uc7ac \ud558\ub294 <code>_id<\/code> \uc640 \uac19\uc740 \uac12\uc73c\ub85c \ubb38\uc11c\uac00 \uc0c9\uc778\ub418\uba74 update \ucc98\ub9ac\uac00 \ub418\uae30 \ub54c\ubb38\uc5d0 \ucd08\uae30 \uc801\uc7ac\ub294 \ubcc4\ub3c4\ub85c \ub9cc\ub4e4 \ud544\uc694\ub294 \uc5c6\uc774 updateAll \ud558\ub098\uc758 \ud568\uc218\ub85c \ub9cc\ub4e4\uc5c8\ub2e4. <code>MaxFromIMS<\/code> \ud568\uc218\uc5d0\uc11c \ud604\uc7ac \uac00\uc7a5 \ucd5c\uadfc\uc5d0 \ub4f1\ub85d\ub41c IMS \ubc88\ud638\ub97c \uc54c\uc544\ub0b4\uace0 \uc774\ub97c 1\ubc88 \uc774\uc288\ubd80\ud130 <code>_BATCH_SIZE<\/code> \ub2e8\uc704\ub85c \ub04a\uc5b4 \ub124 \uac1c\uc758 \uc4f0\ub808\ub4dc\uc5d0 \uc791\uc5c5\uc744 \ubd84\ubc30\uc2dc\ucf1c\uc900\ub2e4. \uac01 \uc4f0\ub808\ub4dc\ub294 \uc55e\uc11c \ub9cc\ub4e0 IndexIssueList \uc791\uc5c5\uc744 \uc218\ud589\ud558\uba70 _BATCH_SIZE \ud06c\uae30 \ub9cc\ud07c\uc758 \uc774\uc288 \uac2f\uc218\ub97c \uc0c9\uc778\ud558\uace0 \uc885\ub8cc\ub41c\ub2e4.<\/p>\n<pre><code class=\"language-python\">_URL_ISSLIST        = &quot;https:\/\/ims.tmaxsoft.com\/tody\/ims\/issue\/issueList.do&quot;\n_SLCT_ISSLIST       = &quot;#IssueListForm&quot;\n_BATCH_SIZE         = 50\n_KEEP_ALIVE_LIMIT   = &#039;30s&#039;\n\ndef MaxFromIMS():\n    from selenium import webdriver\n    from bs4 import BeautifulSoup\n\n    driver = webdriver.PhantomJS(&#039;.\/webdriver\/phantomjs&#039;,\n        service_log_path=&#039;.\/logs\/ghostdriver.log&#039;)\n    driver.implicitly_wait(1)\n\n    driver.get(_URL_LOGIN)\n    driver.find_element_by_name(&#039;id&#039;).send_keys(_LOGIN_ID)\n    driver.find_element_by_name(&#039;password&#039;).send_keys(_LOGIN_PASS)\n    driver.find_element_by_css_selector(_SLCT_LOGIN_INPUT).click()\n\n    driver.get(_URL_ISSLIST)\n    html = driver.page_source\n    soup = 
BeautifulSoup(html,&#039;html.parser&#039;)\n\n    max_iss = soup.select(_SLCT_ISSLIST)[0].findAll(&#039;table&#039;,attrs={&#039;bgcolor&#039;:&#039;#FFFFFF&#039;})[0].findAll(&#039;tr&#039;)[1].findAll(&#039;td&#039;)[0].text.strip()\n\n    driver.close()\n\n    return int(max_iss)\n\ndef UpdateAll():\n    import threading\n    from time import sleep\n\n    start       = 1\n    end         = MaxFromIMS()\n    jobs = list(range(start,end+1))\n    while jobs:\n        buf = []\n        if threading.active_count() &lt; 5:\n            while len(buf) &lt; _BATCH_SIZE:\n                if len(jobs) == 0: break\n                buf.append(jobs.pop(0))\n            threading.Thread(target=IndexIssueList,args=(buf,)).start()\n        sleep(1)<\/code><\/pre>\n<h2>\ubbf8\uc885\ub8cc \uc774\uc288\uc5d0 \ub300\ud55c \uc7ac\uc218\uc9d1<\/h2>\n<p>IMS \uc2dc\uc2a4\ud15c\uc740 \ucd08\ub2e8\uc704\uc758 \uac31\uc2e0 \uc791\uc5c5\uc774 \uc774\ub904\uc9c0\uc9c0\ub294 \uc54a\uace0 \uadf8\uc815\ub3c4\ub85c \uac80\uc0c9\uc5d4\uc9c4\uc5d0\uc11c \ubc18\uc601\ud574\uc904 \ud544\uc694\ub294 \uc5c6\uc9c0\ub9cc, \uc77c \ub2e8\uc704\ub85c \ubcf4\uba74 \uc81c\ubc95 \ub9ce\uc740 \uac31\uc2e0\uc774 \uc774\ub904\uc9c0\uace0 \uc788\ub2e4. \uc2dc\uc2dc\uac01\uac01 \ubcc0\ud558\ub294 \ub370\uc774\ud130\ub97c \uac80\uc0c9\uc5d4\uc9c4\uc5d0 \ucd5c\uc2e0\ud654 \ud558\uc5ec \ubc18\uc601\ud558\uae30 \uc704\ud574 \ud574\ub2f9 \uc2dc\uc2a4\ud15c\uc758 DB\uc5d0 \uc9c1\uc811 \uc811\uadfc\ud560 \uc218 \uc788\ub2e4\uba74 \uac00\uc7a5 \ud6a8\uc728\uc801\uc774\uaca0\uc9c0\ub9cc \uadf8\ub7f4 \uad8c\ud55c\uc740 \uc5c6\uae30\ub54c\ubb38\uc5d0 \ud558\ub8e8 \ud55c\ubc88 \uc815\ub3c4 \uac31\uc2e0\ub420 \uac00\ub2a5\uc131\uc774 \uc788\ub294 \ubb38\uc11c\ub4e4\uc744 \ub2e4\uc2dc \uc7ac\uc218\uc9d1\ud574\uc8fc\ub294 \uac83\uc73c\ub85c.. 
\uc774 \uc7ac\uc218\uc9d1 \ub300\uc0c1\uc744 \uc804\uccb4\ub85c \ud558\uae30\uc5d0\ub294 \ubd80\ub2f4\uc2a4\ub7fd\uae30 \ub54c\ubb38\uc5d0 \ucd5c\uc18c\ud654\ud558\uae30 \uc704\ud574 \uc885\ub8cc\ub418\uc9c0 \uc54a\uc740 \uc774\uc288\ub97c Scroll API \uc774\uc6a9\ud574\uc11c \ud30c\uc545\ud55c\ub2e4.<\/p>\n<pre><code class=\"language-python\">def GetIssNumberNotClosed():\n    body = { \n            &quot;_source&quot;:[&quot;Issue Number&quot;,&quot;Status&quot;],\n            &quot;query&quot; : { \n                &quot;bool&quot;:{\n                    &quot;must_not&quot;:[\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Closed&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Closed_P&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Rejected&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Rejected (\uc5c6\uc74c)&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Prevented&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Resolved&quot;\n                            }\n                        }\n                    ]\n                }\n            },\n            &quot;size&quot;:10\n        }\n\n    
from elasticsearch import Elasticsearch\n    es_client = Elasticsearch([&quot;localhost:9200&quot;],timeout=300)\n    response = es_client.search(\n        index = &#039;issue-v0.1.4&#039;,\n        scroll = _KEEP_ALIVE_LIMIT,\n        size = 100,\n        body = body\n        )\n\n    sid = response[&#039;_scroll_id&#039;]\n    fetched = len(response[&#039;hits&#039;][&#039;hits&#039;])\n    fcnt = fetched\n\n    nums = []\n    for i in range(fetched):\n        nums.append(int(response[&#039;hits&#039;][&#039;hits&#039;][i][&#039;_source&#039;][&#039;Issue Number&#039;]))\n\n    while(fetched&gt;0):\n        response = es_client.scroll(scroll_id=sid, scroll=_KEEP_ALIVE_LIMIT)\n        sid = response[&#039;_scroll_id&#039;]\n        fetched = len(response[&#039;hits&#039;][&#039;hits&#039;])\n        for i in range(fetched):\n            nums.append(int(response[&#039;hits&#039;][&#039;hits&#039;][i][&#039;_source&#039;][&#039;Issue Number&#039;]))\n\n    return nums\n\ndef UpdateNotClosed():\n    import threading\n    jobs = GetIssNumberNotClosed()\n    while jobs:\n        buf = []\n        if threading.active_count() &lt; 5:\n            while len(buf) &lt; 50:\n                if len(jobs) == 0: break\n                buf.append(jobs.pop(0))\n            threading.Thread(target=IndexIssueList,args=(buf,)).start()<\/code><\/pre>\n<h2>\uc0c8\ub85c\uc6b4 \uc774\uc288 \uc218\uc9d1<\/h2>\n<p>\uc0c8\ub85c\uc6b4 \uc774\uc288\ub294 \uc218\uc9d1\ub41c \uc774\uc288 \ubc88\ud638\uc758 \ucd5c\ub300 \uac12\uc5d0\uc11c\ubd80\ud130 IMS\uc5d0 \ub4f1\ub85d\ub41c \uc774\uc288\ubc88\ud638\uc758 \ucd5c\ub300\uac12\uae4c\uc9c0\uc778\ub370 MaxFromES \uc5d0\uc11c \uc218\uc9d1\ud55c \ucd5c\ub300 \ubc88\ud638\ub97c \ubc1b\uc544\ub0b4\uace0 \uc5ec\uae30\uc11c\ubd80\ud130 MaxFromIMS \uae4c\uc9c0 \uc218\uc9d1\ud558\ub294 Renew \ud568\uc218\ub97c \uc0dd\uac01\ud574\ubcfc \uc218 \uc788\ub2e4. 
\uc774\ub294 \uc2dc\uac04\ub2f9 \uba87 \uac1c\uc529\ub9cc \ub4f1\ub85d\ub418\uc5b4\uc11c \uc9e7\uc740 \uc8fc\uae30\ub85c \uc218\uc9d1\ud574\ub3c4 \ub420 \uac83 \uac19\ub2e4.<\/p>\n<pre><code class=\"language-py\">def MaxFromES():\n    from elasticsearch import Elasticsearch\n\n    es = Elasticsearch(&quot;localhost:9200&quot;,timeout=10)\n    body = {\n            &quot;size&quot;:0,\n            &quot;aggs&quot; : {\n                    &quot;max_iss&quot; : { &quot;max&quot; : { &quot;field&quot; : &quot;Issue Number&quot; } }\n            }\n    }\n    res = es.search(index=&quot;issue-v0.1.4&quot;, body=body)\n\n    return int(res[&#039;aggregations&#039;][&#039;max_iss&#039;][&#039;value&#039;])\n\ndef Renew():\n    from_number = MaxFromES()+1\n    to_number = MaxFromIMS()+1\n    IndexIssueList(range(from_number,to_number))<\/code><\/pre>\n<p>\uc2e4\ud589\ud30c\uc77c\ub85c \ub9cc\ub4e4\uae30 \uc704\ud574 \uba54\uc778 \ud568\uc218\uc5d0\uc11c\ub294 \uc544\ub798\uc640 \uac19\uc774 \ucee4\ub9e8\ub4dc\ub77c\uc778 \uc785\ub825\uc744 \ubc1b\uc544\uc11c \ub3cc\ub824\ubcfc \uc218 \uc788\ub2e4.<\/p>\n<pre><code class=\"language-py\">#!\/Library\/Frameworks\/Python.framework\/Versions\/3.7\/bin\/python3\n\nif __name__ == &#039;__main__&#039;:\n    import sys\n    mode = sys.argv[1]\n\n    if mode == &#039;renew&#039;:             Renew()\n    elif mode == &#039;updateNotClosed&#039;: UpdateNotClosed()\n    elif mode == &#039;updateAll&#039;:       UpdateAll()\n    elif mode == &#039;single&#039;:          IndexIssueList([int(sys.argv[2])])\n    elif mode == &#039;daemon&#039;:\n        from datetime import datetime\n        from time import sleep\n        while True:\n            sleep(60)\n            now = datetime.now()\n            if now.hour % 24 == 17 and now.minute % 60 == 30: updateNotClosed()\n            if now.minute % 10 == 0: renew()<\/code><\/pre>\n<h2>\uc804\uccb4 \uc18c\uc2a4\ucf54\ub4dc<\/h2>\n<p>\uc704\uc5d0 \ubd80\ubd84\ub4e4 \ud569\uccd0\ub193\uc740 
\uac83\uc5d0, IndexIssueList \ud568\uc218\uc5d0 <code>print(&#039;try to crawl issue&#039;, il)<\/code> \ubd80\ubd84\ub9cc \ucd94\uac00\ud588\ub2e4. \ucd08\uae30 \uc801\uc7ac\uc2dc\uc5d0\ub294 <code>.\/crawl.py updateAll<\/code> \uacfc \uac19\uc774 \uc2e4\ud589\ud574\uc8fc\uba74 \ub418\uace0 \ub098\uba38\uc9c0\ub3c4 \ube44\uc2b7\ud558\uac8c \uc0ac\uc6a9\ud558\uba74 \ub41c\ub2e4.<\/p>\n<pre><code class=\"language-python\">#!\/Library\/Frameworks\/Python.framework\/Versions\/3.7\/bin\/python3\n\n_LOGIN_ID           = &quot;id&quot;\n_LOGIN_PASS         = &quot;pw&quot;\n_URL_LOGIN          = &quot;https:\/\/ims.tmaxsoft.com\/tody\/auth\/login.do&quot;\n_URL_ISSVIEW        = &quot;https:\/\/ims.tmaxsoft.com\/tody\/ims\/issue\/issueView.do?issueId=&quot;\n_SLCT_LOGIN_INPUT   = &quot;body &gt; form &gt; table &gt; tbody &gt; tr &gt; td &gt; table &gt; tbody &gt; tr:nth-child(2) &gt; td:nth-child(1) &gt; table &gt; tbody &gt; tr &gt; td:nth-child(2) &gt; table &gt; tbody &gt; tr &gt; td:nth-child(2) &gt; table &gt; tbody &gt; tr &gt; td:nth-child(3) &gt; input&quot;\n_SLCT_SUBJ          = &quot;body &gt; div:nth-of-type(2) &gt; table &gt; tbody &gt; tr &gt; td:nth-of-type(2) &gt; table &gt; tbody &gt; tr:nth-of-type(2) &gt; td &gt; table:nth-of-type(1) &gt; tbody &gt; tr &gt; td &gt; table &gt; tbody &gt; tr &gt; td:nth-of-type(2)&quot;\n_SLCT_DTLS          = &quot;#IssueDescriptionDiv &gt; table &gt; tbody &gt; tr &gt; td&quot;\n_SLCT_IINFO         = &quot;#issueInfoTable&quot;\n\ndef IndexIssueList(issueList):\n    import re\n    from selenium import webdriver\n    from bs4 import BeautifulSoup\n\n    driver = webdriver.PhantomJS(&#039;.\/webdriver\/phantomjs&#039;,\n        service_log_path=&#039;.\/logs\/ghostdriver.log&#039;)\n    driver.implicitly_wait(1)\n\n    driver.get(_URL_LOGIN)\n    driver.find_element_by_name(&#039;id&#039;).send_keys(_LOGIN_ID)\n    driver.find_element_by_name(&#039;password&#039;).send_keys(_LOGIN_PASS)\n    
driver.find_element_by_css_selector(_SLCT_LOGIN_INPUT).click()\n\n    iss = []\n    for il in issueList:\n        print(&#039;try to crawl issue&#039;, il)\n        try:\n            data = {}\n            driver.get(_URL_ISSVIEW+str(il))\n            html = driver.page_source\n            soup = BeautifulSoup(html,&#039;html.parser&#039;)\n\n            subj = soup.select(_SLCT_SUBJ)[0].text.strip()\n            data.update({&#039;Subject&#039;:subj})\n\n            iinfo = soup.select(_SLCT_IINFO)[0].findAll(&#039;tr&#039;)\n            for b in iinfo:\n                key,val = &#039;&#039;,&#039;&#039;\n                for i in b.findAll(&#039;td&#039;,attrs={&#039;class&#039;:re.compile(r&#039;title|data&#039;)}):\n                    if i[&#039;class&#039;][0] == &#039;title&#039;: key = i.text.strip()\n                    else: val = i.text.strip()\n                data.update({key:re.sub(&#039;[\\n\\t ]+&#039;,&#039; &#039;,val)})\n\n            if data[&#039;Closed Date&#039;] == &#039;&#039;:\n                data[&#039;Closed Date&#039;]=&#039;1970\/01\/01 09:00:00&#039;\n            if data[&#039;Date of final order&#039;] == &#039;&#039;:\n                data[&#039;Date of final order&#039;]=&#039;1970\/01\/01 09:00:00&#039;\n\n            dtls = soup.select(_SLCT_DTLS)\n            data.update({&#039;Issue Details&#039;:re.sub(&#039;[\\n\\t ]+&#039;,&#039; &#039;,dtls[0].text.strip())})\n\n            comments,tmp = soup.findAll(&#039;div&#039;,attrs={&#039;class&#039;:&#039;commDescTR data&#039;}),&quot;&quot;\n            for c in comments: tmp += c.text.strip()+&#039;\\n&#039;\n            data.update({&#039;Action Log&#039;:tmp})\n            iss.append(data)\n        except IndexError as error:\n            logger.info(error)\n            continue\n\n    driver.close()\n\n    for i in range(len(iss)):\n        iss[i].update({&#039;Issue Number&#039;:re.search(r&#039;\\d+&#039;,iss[i][&#039;Issue Number&#039;]).group()})\n\n    actions = [\n      
  {\n            &quot;_index&quot;: &quot;issue-v0.1.4&quot;,\n            &quot;_type&quot;: &quot;_doc&quot;,\n            &quot;_id&quot;: iss[i][&#039;Issue Number&#039;],\n            &quot;_source&quot;: iss[i]\n        }\n        for i in range(len(iss))\n    ]\n\n    from elasticsearch import Elasticsearch\n    from elasticsearch import helpers\n    es_client = Elasticsearch([&quot;localhost:9200&quot;],timeout=300)\n    res = helpers.bulk(es_client, actions)\n\n_URL_ISSLIST        = &quot;https:\/\/ims.tmaxsoft.com\/tody\/ims\/issue\/issueList.do&quot;\n_SLCT_ISSLIST       = &quot;#IssueListForm&quot;\n_BATCH_SIZE         = 50\n_KEEP_ALIVE_LIMIT   = &#039;30s&#039;\n\ndef MaxFromIMS():\n    from selenium import webdriver\n    from bs4 import BeautifulSoup\n\n    driver = webdriver.PhantomJS(&#039;.\/webdriver\/phantomjs&#039;,\n            service_log_path=&#039;.\/logs\/ghostdriver.log&#039;)\n    driver.implicitly_wait(1)\n\n    driver.get(_URL_LOGIN)\n    driver.find_element_by_name(&#039;id&#039;).send_keys(_LOGIN_ID)\n    driver.find_element_by_name(&#039;password&#039;).send_keys(_LOGIN_PASS)\n    driver.find_element_by_css_selector(_SLCT_LOGIN_INPUT).click()\n\n    driver.get(_URL_ISSLIST)\n    html = driver.page_source\n    soup = BeautifulSoup(html,&#039;html.parser&#039;)\n\n    max_iss = soup.select(_SLCT_ISSLIST)[0].findAll(&#039;table&#039;,attrs={&#039;bgcolor&#039;:&#039;#FFFFFF&#039;})[0].findAll(&#039;tr&#039;)[1].findAll(&#039;td&#039;)[0].text.strip()\n\n    driver.close()\n\n    return int(max_iss)\n\ndef UpdateAll():\n    import threading\n    from time import sleep\n\n    start       = 1\n    end         = MaxFromIMS()\n    jobs = list(range(start,end+1))\n    while jobs:\n        buf = []\n        if threading.active_count() &lt; 5:\n            while len(buf) &lt; _BATCH_SIZE:\n                if len(jobs) == 0: break\n                buf.append(jobs.pop(0))\n            threading.Thread(target=IndexIssueList,args=(buf,)).start()\n 
       sleep(1)\n\ndef GetIssNumberNotClosed():\n    body = { \n            &quot;_source&quot;:[&quot;Issue Number&quot;,&quot;Status&quot;],\n            &quot;query&quot; : { \n                &quot;bool&quot;:{\n                    &quot;must_not&quot;:[\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Closed&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Closed_P&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Rejected&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Rejected (\uc5c6\uc74c)&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Prevented&quot;\n                            }\n                        },\n                        {\n                            &quot;term&quot;:{\n                                &quot;Status.keyword&quot;:&quot;Resolved&quot;\n                            }\n                        }\n                    ]\n                }\n            },\n            &quot;size&quot;:10\n        }\n\n    from elasticsearch import Elasticsearch\n    es_client = Elasticsearch([&quot;localhost:9200&quot;],timeout=300)\n    response = es_client.search(\n        index = &#039;issue-v0.1.4&#039;,\n        scroll = _KEEP_ALIVE_LIMIT,\n        size = 100,\n        body = body\n        )\n\n    sid = response[&#039;_scroll_id&#039;]\n  
  fetched = len(response[&#039;hits&#039;][&#039;hits&#039;])\n    fcnt = fetched\n\n    nums = []\n    for i in range(fetched):\n        nums.append(int(response[&#039;hits&#039;][&#039;hits&#039;][i][&#039;_source&#039;][&#039;Issue Number&#039;]))\n\n    while(fetched&gt;0):\n        response = es_client.scroll(scroll_id=sid, scroll=_KEEP_ALIVE_LIMIT)\n        sid = response[&#039;_scroll_id&#039;]\n        fetched = len(response[&#039;hits&#039;][&#039;hits&#039;])\n        for i in range(fetched):\n            nums.append(int(response[&#039;hits&#039;][&#039;hits&#039;][i][&#039;_source&#039;][&#039;Issue Number&#039;]))\n\n    return nums\n\ndef UpdateNotClosed():\n    import threading\n    jobs = GetIssNumberNotClosed()\n    while jobs:\n        buf = []\n        if threading.active_count() &lt; 5:\n            while len(buf) &lt; 50:\n                if len(jobs) == 0: break\n                buf.append(jobs.pop(0))\n            threading.Thread(target=IndexIssueList,args=(buf,)).start()\n\ndef MaxFromES():\n    from elasticsearch import Elasticsearch\n\n    es = Elasticsearch(&quot;localhost:9200&quot;,timeout=10)\n    body = {\n                    &quot;size&quot;:0,\n                    &quot;aggs&quot; : {\n                                    &quot;max_iss&quot; : { &quot;max&quot; : { &quot;field&quot; : &quot;Issue Number&quot; } }\n                    }\n    }\n    res = es.search(index=&quot;issue-v0.1.4&quot;, body=body)\n\n    return int(res[&#039;aggregations&#039;][&#039;max_iss&#039;][&#039;value&#039;])\n\ndef Renew():\n    from_number = MaxFromES()+1\n    to_number = MaxFromIMS()+1\n    IndexIssueList(range(from_number,to_number))\n\nif __name__ == &#039;__main__&#039;:\n    import sys\n    mode = sys.argv[1]\n\n    if mode == &#039;renew&#039;:             Renew()\n    elif mode == &#039;updateNotClosed&#039;: UpdateNotClosed()\n    elif mode == &#039;updateAll&#039;:       UpdateAll()\n    elif mode == &#039;single&#039;:          
IndexIssueList([int(sys.argv[2])])\n    elif mode == &#039;daemon&#039;:\n        from datetime import datetime\n        from time import sleep\n        while True:\n            sleep(60)\n            now = datetime.now()\n            if now.hour % 24 == 17 and now.minute % 60 == 30: updateNotClosed()\n            if now.minute % 10 == 0: renew()<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>[ElasticSearch] \uac80\uc0c9\uc5d4\uc9c4 \ub9cc\ub4e4\uae30 2 &#8211; IMS \ub370\uc774\ud130 \ud06c\ub864\ub9c1 Selenium, BeautifulSoup \ub97c \uc774\uc6a9\ud574\uc11c IMS \uc774\uc288 \ud398\uc774\uc9c0 Crawler \ub97c \ub9cc\ub4e0\ub2e4. \uae30\ucd08\uc801\uc778 \ubd80\ubd84\uc740 [\uc5ec\uae30]()\ub97c \ucc38\uace0. \ub2e8\uc704 \ud06c\ub864\ub7ec \uc218\uc9d1\ud560 \uc774\uc288 \ubc88\ud638\uc5d0 \ub300\ud55c \ub9ac\uc2a4\ud2b8\ub97c \ub9cc\ub4e4\uc5b4 \ud55c\ubc88\uc5d0 \uc218\uc9d1\ud558\uace0 Elasticsearch\uc5d0 \uc778\ub371\uc2f1 \ud558\ub294 IndexIssueList \ud568\uc218\ub97c \ub9cc\ub4e4\uc5c8\ub2e4. (IMS \ub294 \uc774\uc288 \ubc88\ud638\ub97c URL(https:\/\/ims.tmaxsoft.com\/tody\/ims\/issue\/issueView.do?issueId=) \uc5d0 \ubcc0\uc218\ub85c \ub300\uc785\ud574\uc11c \uac01 \uc774\uc288 \ud398\uc774\uc9c0\uc5d0 \uc811\uadfc\ud560 \uc218 \uc788\ub2e4.) 
_LOGIN_ID = &quot;IMS_USERNAME&quot; [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[12,10],"tags":[],"class_list":["post-1134","post","type-post","status-publish","format-standard","hentry","category-elasticsearch","category-python"],"_links":{"self":[{"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/posts\/1134","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/comments?post=1134"}],"version-history":[{"count":5,"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/posts\/1134\/revisions"}],"predecessor-version":[{"id":1151,"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/posts\/1134\/revisions\/1151"}],"wp:attachment":[{"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/media?parent=1134"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/categories?post=1134"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/oboki.net\/workspace\/wp-json\/wp\/v2\/tags?post=1134"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}