Skip to content

Commit 7345234

Browse files
author
root
committed
quick read
1 parent 74ceed9 commit 7345234

File tree

8 files changed

+148
-22
lines changed

8 files changed

+148
-22
lines changed

README.md

+14-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ SUPERVISORD_PASSWD=something
3838

3939
Run `bash bin/create_db.sh`
4040

41+
42+
### Install Quick Read dependencies
43+
44+
1. install cpan
45+
2. install Text::Unidecode in cpan (`cpan Text::Unidecode`; the module name is case-sensitive)
46+
3. git clone https://github.com/brucemiller/LaTeXML
47+
4. inside the cloned `LaTeXML` directory, run: perl Makefile.PL; make; make install
48+
4149
### Fetch resources
4250

4351
Fetch Arxiv papers and tweets.
@@ -62,8 +70,13 @@ PYTHONPATH="." python dlmonitor/webapp/app.py
6270
bash bin/config_server.sh
6371
```
6472

65-
2. Start Gunicorn processes through supervisord
73+
3. Start Gunicorn processes through supervisord
6674

6775
```bash
6876
bash bin/start_supervisord.sh
6977
```
78+
4. Start arxiv source loading worker
79+
80+
```bash
81+
PYTHONPATH="." python bin/auto_load_arxiv.py --forever
82+
```

alembic/versions/fb7131fc3951_.py

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""empty message
2+
3+
Revision ID: fb7131fc3951
4+
Revises: 220426586e09
5+
Create Date: 2019-06-14 10:51:38.034897
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
13+
revision = 'fb7131fc3951'
14+
down_revision = '220426586e09'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
    """Create the ``working`` table.

    The table has an auto-incrementing integer ``id`` primary key and two
    nullable 255-character string columns, ``type`` and ``param``.
    """
    # Originally auto-generated by Alembic; adjust by hand if the model changes.
    id_column = sa.Column('id', sa.Integer(), autoincrement=True, nullable=False)
    type_column = sa.Column('type', sa.String(length=255), nullable=True)
    param_column = sa.Column('param', sa.String(length=255), nullable=True)
    op.create_table(
        'working',
        id_column,
        type_column,
        param_column,
        sa.PrimaryKeyConstraint('id')
    )
28+
29+
30+
def downgrade():
    """Drop the ``working`` table."""
    # Originally auto-generated by Alembic; adjust by hand if the model changes.
    table_name = 'working'
    op.drop_table(table_name)

bin/auto_load_arxiv.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import sys, os
2+
sys.path.append(".")
3+
from dlmonitor.db import session_scope
4+
from dlmonitor.db_models import WorkingQueueModel
5+
from dlmonitor.latex import build_paper_html, retrieve_paper_html
6+
from dlmonitor.settings import PDF_PATH
7+
from sqlalchemy import desc
8+
from argparse import ArgumentParser
9+
import logging
10+
import time
11+
logging.basicConfig(level=logging.INFO)
12+
13+
if __name__ == '__main__':
    # Worker entry point: drain "load_arxiv" jobs from the working queue and
    # build the HTML for each queued paper.
    #
    #   --forever   keep polling the queue instead of exiting after one batch.
    ap = ArgumentParser()
    ap.add_argument("--forever", action="store_true")
    args = ap.parse_args()
    with session_scope() as session:
        while True:
            # Process the queue in batches of at most 10 jobs.
            jobs = (
                session.query(WorkingQueueModel)
                .filter(WorkingQueueModel.type == "load_arxiv")
                .limit(10)
                .all()
            )
            logging.info("get {} jobs".format(len(jobs)))
            for job in jobs:
                arxiv_token = job.param
                try:
                    build_paper_html(arxiv_token)
                    logging.info("built {}".format(arxiv_token))
                except Exception:
                    # A single broken paper must not kill a long-running
                    # worker; record the traceback and move on.
                    logging.exception("failed to build {}".format(arxiv_token))
                # The job is removed whether or not the build succeeded, so a
                # failing job cannot be picked up again on every poll and
                # block the jobs queued behind it.
                session.delete(job)
                session.commit()
            if not args.forever:
                # One-shot mode: a single batch, then exit without sleeping.
                break
            if not jobs:
                # Queue is empty; wait a little before polling again.
                time.sleep(3)

dlmonitor/db_models.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -58,5 +58,16 @@ class TwitterModel(Base):
5858
search_vector = Column(TSVectorType('text'))
5959

6060
def __repr__(self):
61-
template = '<Arxiv(id="{0}", user_name="{1}")>'
61+
template = '<Twitter(id="{0}", user_name="{1}")>'
6262
return str_repr(template.format(self.id, self.user))
63+
64+
class WorkingQueueModel(Base):
    """A queued background job, stored in the ``working`` table.

    ``type`` names the kind of job and ``param`` carries its argument, e.g.
    type ``"load_arxiv"`` with an arXiv token as ``param``.
    """

    __tablename__ = "working"

    # Auto-incrementing primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Kind of job.
    type = Column(String(255), nullable=True)
    # Argument for the job.
    param = Column(String(255), nullable=True)

    def __repr__(self):
        # The bare name ``__tablename__`` is not visible inside a method, and
        # the integer id cannot be concatenated to a string, so build the text
        # with a template and the same ``str_repr`` helper the other models in
        # this module use.
        template = '<WorkingQueue(id="{0}", type="{1}", param="{2}")>'
        return str_repr(template.format(self.id, self.type, self.param))

dlmonitor/latex.py

+16-11
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import urllib2
22
import os
3+
import subprocess
34

45
from dlmonitor import settings
56

67
def build_paper_html(arxiv_id):
78
src_path = "{}/{}".format(settings.SOURCE_PATH, arxiv_id)
89
html_path = "{}/main.html".format(src_path)
9-
return False
1010
if os.path.exists(src_path):
1111
return html_path if os.path.exists(html_path) else None
1212
opener = urllib2.build_opener()
@@ -17,7 +17,7 @@ def build_paper_html(arxiv_id):
1717
if (int(file_size) / 1024. / 1024. > 15):
1818
# File too big
1919
os.mkdir(src_path)
20-
return False
20+
return None
2121
print("download {}: {}".format(arxiv_id, file_size))
2222
data = page.read()
2323
os.mkdir(src_path)
@@ -26,23 +26,28 @@ def build_paper_html(arxiv_id):
2626
os.chdir(src_path)
2727
os.system("tar xzf {} --directory {}".format(tgz_path, src_path))
2828
texfiles = [fn for fn in os.listdir(src_path) if fn.endswith(".tex")]
29-
select_texfile = texfiles[0]
30-
if len(texfiles) > 1:
31-
for fn in texfiles:
32-
text = open("{}/{}".format(src_path, fn)).read()
33-
if "begin{document}" in text:
34-
select_texfile = fn
35-
break
3629
if texfiles:
37-
os.system("latexml --includestyles --dest=main.xml {}".format(select_texfile.replace(".tex", "")))
30+
select_texfile = texfiles[0]
31+
if len(texfiles) > 1:
32+
for fn in texfiles:
33+
text = open("{}/{}".format(src_path, fn)).read()
34+
if "begin{document}" in text:
35+
select_texfile = fn
36+
break
37+
cmd = "latexml --includestyles --dest=main.xml {}".format(select_texfile.replace(".tex", ""))
38+
os.system(cmd)
39+
os.system("latexmlpost --dest=main.html main.xml")
3840
os.system("latexmlpost --dest=main.html main.xml")
3941
os.remove(tgz_path)
42+
open("{}/.loaded".format(src_path), "wb").write("loaded")
4043
return html_path if os.path.exists(html_path) else None
4144

4245
def retrieve_paper_html(arxiv_token):
4346
src_path = "{}/{}".format(settings.SOURCE_PATH, arxiv_token)
4447
html_path = "{}/main.html".format(src_path)
45-
if os.path.exists(src_path) and not os.path.exists(html_path):
48+
if os.path.exists(src_path) and not os.path.exists("{}/.loaded".format(src_path)):
49+
html_body = "PROCESSING"
50+
elif os.path.exists(src_path) and not os.path.exists(html_path):
4651
html_body = "NOT_AVAILABE"
4752
elif os.path.exists(src_path) and os.path.exists(html_path):
4853
html_body = open(html_path).read().decode("utf-8")

dlmonitor/webapp/app.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22
from flask import Flask, request, redirect, session, send_from_directory
33
from flask import render_template, send_from_directory
4-
from dlmonitor.db import close_global_session
4+
from dlmonitor.db import close_global_session, get_global_session
55
from dlmonitor.fetcher import get_posts
66
from dlmonitor import settings
77
from urllib2 import unquote
@@ -161,9 +161,19 @@ def save_mendeley():
161161

162162
@app.route("/load_fulltext/<arxiv_token>")
def load_fulltext(arxiv_token):
    """Queue a ``load_arxiv`` job for *arxiv_token* and reply ``"OK"``.

    The page calls this endpoint each time it is rendered while the paper is
    not ready, so an identical job that is still queued is not added again.

    NOTE(review): assumes ``get_global_session()`` returns a SQLAlchemy
    session supporting ``query``/``rollback`` -- confirm against dlmonitor.db.
    """
    from dlmonitor.db_models import WorkingQueueModel
    db_session = get_global_session()
    try:
        already_queued = db_session.query(WorkingQueueModel).filter(
            WorkingQueueModel.type == "load_arxiv",
            WorkingQueueModel.param == arxiv_token
        ).first()
        if already_queued is None:
            job = WorkingQueueModel(
                type="load_arxiv",
                param=arxiv_token
            )
            db_session.add(job)
            db_session.commit()
    except Exception:
        # The session is shared across requests; roll back so a failed
        # transaction does not leave it unusable, then let the error surface.
        db_session.rollback()
        raise
    return "OK"
166173

174+
@app.route("/retrieve_fulltext/<arxiv_token>")
def retrieve_fulltext(arxiv_token):
    """Return whatever ``retrieve_paper_html`` yields for *arxiv_token*."""
    # Imported lazily, inside the view, as the original code does.
    from dlmonitor.latex import retrieve_paper_html
    html_body = retrieve_paper_html(arxiv_token)
    return html_body
168178

169179
@app.route("/arxiv_files/<arxiv_token>/<path:fp>")

dlmonitor/webapp/static/app.js

+23-2
Original file line numberDiff line numberDiff line change
@@ -244,14 +244,35 @@ dlmonitor.load_fulltext = function(arxiv_token) {
244244
success: function(data) {
245245
// console.log(data);
246246
dlmonitor.ajaxCount --;
247-
if (data == "NOT_AVAILABE") {
247+
$("#latex-content").data("arxiv_token", arxiv_token);
248+
setTimeout(dlmonitor.retrieve_fulltext, 3000);
249+
}
250+
});
251+
}
252+
253+
// Poll the server for the rendered paper whose token is stored on
// #latex-content. While the server answers "PROCESSING" or "NOT_EXIST" the
// request is retried every 3 seconds; any other answer is shown in the page.
dlmonitor.retrieve_fulltext = function() {
    dlmonitor.ajaxCount ++;
    // Declared with `var` so the token stays local instead of leaking into
    // the global scope as an implicit global.
    var arxiv_token = $("#latex-content").data("arxiv_token");
    $.ajax({
        url: '/retrieve_fulltext/' + arxiv_token,
        type: 'GET',
        error: function() {
            dlmonitor.ajaxCount --;
            $("#latex-content").html("An error is detected when loading the paper.");
        },
        success: function(data) {
            // console.log(data);
            dlmonitor.ajaxCount --;
            if (data == "PROCESSING" || data == "NOT_EXIST") {
                // Not ready yet: check again in 3 seconds.
                setTimeout(dlmonitor.retrieve_fulltext, 3000);
            } else if (data == "NOT_AVAILABE") {
                $("#latex-content").html("This feature is not avaialbe for this paper.");
            } else {
                $("#latex-content").html(data);
            }
        }
    });
};
255276

256277
dlmonitor.init = function() {
257278
dlmonitor.updateAll(true);

dlmonitor/webapp/templates/single.html

+5-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
<meta content="Things happening in deep learning: arxiv, twitter, reddit" name="description">
88
<link href="https://fonts.googleapis.com/css?family=Source+Sans+Pro:200,300,400,600,700,900" rel="stylesheet">
99
<link href="https://bootswatch.com/3/darkly/bootstrap.min.css" rel="stylesheet">
10+
<meta content='width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0' name='viewport' />
1011
<link href="/static/LaTeXML.css" media="all" rel="stylesheet" type="text/css">
1112
<link href="/static/ltx-article.css" media="all" rel="stylesheet" type="text/css">
1213
<link href="/static/default.css?v=15" media="all" rel="stylesheet" type="text/css">
@@ -18,7 +19,7 @@
1819
</script>
1920
<script src="https://cdnjs.cloudflare.com/ajax/libs/js-cookie/2.1.4/js.cookie.min.js" type="text/javascript">
2021
</script>
21-
<script src="/static/app.js?v=12" type="text/javascript">
22+
<script src="/static/app.js?v=13" type="text/javascript">
2223
</script>
2324
</head>
2425
<body>
@@ -54,11 +55,11 @@ <h3><a href="{{post.arxiv_url}}" target="_blank">{{ post.title }}</a></h3>
5455
<h3 style="color:#0ce3ac;">Abstract</h3>
5556
<p style="font-size: 20px;">{{ post.abstract }}</p>
5657
<p>&nbsp;</p>
57-
<h3 style="color:#0ce3ac;">Read Right Now (beta)</h3>
58+
<h3 style="color:#0ce3ac;">Quick Read (beta)</h3>
5859
<div id="latex-content">
5960
{% if html_body == "NOT_AVAILABE" %}
6061
This feature is not available for this paper.
61-
{% elif html_body == "NOT_EXIST" %}
62+
{% elif html_body == "NOT_EXIST" or html_body == "PROCESSING" %}
6263
<div class="sk-folding-cube">
6364
<div class="sk-cube1 sk-cube"></div>
6465
<div class="sk-cube2 sk-cube"></div>
@@ -100,7 +101,7 @@ <h3 style="color:#0ce3ac;">Read Right Now (beta)</h3>
100101
</div>
101102
</div>
102103
</div>
103-
{% if html_body == "NOT_EXIST" %}
104+
{% if html_body == "NOT_EXIST" or html_body == "PROCESSING" %}
104105
<script type="text/javascript" defer="defer">
105106
dlmonitor.load_fulltext("{{ arxiv_token }}");
106107
</script>

0 commit comments

Comments
 (0)