Welcome to mirror list, hosted at ThFree Co, Russian Federation.

wikitravel-optimize-articles.py « crawler - github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
blob: afa7c5d71bcc0bf282295c15d1fc8158782d51ef (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/opt/local/bin/python
import json
import os
import re
import string
import subprocess
import sys

# Absolute directory that holds this script (symlinks resolved); the
# htmlcompressor jar is looked up relative to it.
myPath = os.path.split(os.path.realpath(__file__))[0]

def formatPath(s):
  """Return `s` escaped so a POSIX shell treats it as one literal word.

  Every shell-special character (whitespace, quotes, backslash, `$`,
  backtick, `!`, `&`, `;`, `|`, redirections, parentheses, glob
  characters, braces, `~`, `#`) is prefixed with a backslash. Escaping
  only parentheses, as was done before, left names containing e.g. a
  space or an apostrophe able to break the command line. Strings without
  special characters are returned unchanged.
  """
  return re.sub(r'''([\s\\'"`$!&;|<>()*?\[\]{}~#])''', r'\\\1', s)

# Each stdin line is a JSON array [url, title, fileBase]. For every record,
# '<fileBase>.article' is compressed into '<fileBase>.article.opt' with
# htmlcompressor.jar; records whose output file already exists are skipped.
for i, line in enumerate(sys.stdin):
  if not line.strip():
    # Tolerate blank lines (e.g. a trailing newline) instead of letting
    # json.loads fail on them.
    continue
  (url, title, fileBase) = json.loads(line)
  fileName = fileBase + '.article'
  outFileName = fileName + '.opt'
  if os.path.exists(outFileName):
    sys.stderr.write('Skipping existing {0} {1}\n'.format(i, fileName))
    continue
  sys.stderr.write('Optimizing {0} {1}\n'.format(i, fileName))
  # The arguments are passed as a list, so no shell is involved and paths
  # containing spaces, quotes or other metacharacters need no escaping.
  # check_call raises CalledProcessError on a non-zero exit status; the
  # previous `assert 0 == os.system(...)` was removed entirely under
  # `python -O`, which meant the compressor was never run.
  subprocess.check_call([
      'java', '-jar', os.path.join(myPath, 'htmlcompressor.jar'),
      '--remove-intertag-spaces', '--simple-bool-attr', '--remove-quotes',
      '--remove-js-protocol', '--type', 'html',
      '-o', outFileName, fileName])