#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import json
from bs4 import BeautifulSoup
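# Run this script from the Hugo site root after `hugo` has built ./public;
# it writes public/tipuesearch_content.json and public/search/sitemap.xml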


# Takes the Hugo public directory and returns all html files
def walker(path):
    pages = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.html'):
                pages.append(os.path.join(root, file))
    return pages
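# For example, walker('./public') might return paths like (hypothetical):
#   ['./public/index.html', './public/post/first/index.html']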


# Takes an html page and outputs a dict for the search index
def parser(page):
    with open(page, 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')
    node = {}
    try:
        # Normalize non-breaking spaces in the extracted text (the published
        # listing mangled the original replace arguments into no-ops; this is
        # the recoverable intent)
        node['title'] = soup.title.get_text(' ', strip=True).replace('\xa0', ' ')
        # The first <link> in the page is expected to carry the page url
        node['loc'] = soup.link['href']
        node['text'] = soup.article.get_text(' ', strip=True).replace('\xa0', ' ')
        tags = ['nonetags']
        #for a in soup.find("p", id='tags').find_all("a"):
        #    tags.append(a['href'].split('/')[-1])
        node['tags'] = ' '.join(tags)
        return node
    except (AttributeError, TypeError):
        # Pages without <title>, <link> or <article> are skipped
        return None
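# A parsed node might look like this (hypothetical values):
#   {'title': 'First post', 'loc': 'https://example.com/post/first/',
#    'text': 'Body text of the post ...', 'tags': 'nonetags'}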


# Json accumulator
def jsoner(nodes):
    jdata = {'pages': nodes}
    with open('public/tipuesearch_content.json', 'w') as f:
        json.dump(jdata, f)
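# The file ends up as {"pages": [node, ...]}; note the nodes carry 'loc'
# rather than Tipue Search's stock 'url' field, so the search page is
# presumably configured to read 'loc'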


# Sitemap generation
def sitemaper(nodes):
    xml = '''<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
        xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'''
    url = '<url><loc>{0}</loc><changefreq>daily</changefreq><priority>0.5</priority></url>\n'
    for n in nodes:
        xml = xml + url.format(n['loc'])
    xml = xml + '</urlset>\n'
    # The Hugo build is expected to create public/search; make sure it exists
    os.makedirs('public/search', exist_ok=True)
    with open('public/search/sitemap.xml', 'w') as f:
        f.write(xml)
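# Each node contributes one entry, e.g. (hypothetical loc):
#   <url><loc>https://example.com/post/first/</loc><changefreq>daily</changefreq><priority>0.5</priority></url>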


if os.path.exists('./public'):
    # Index the generated site in ./public
    pages = walker('./public')
    nodes = []
    for p in pages:
        node = parser(p)
        if node:
            nodes.append(node)
    jsoner(nodes)
    sitemaper(nodes)
else:
    print('Error: place this script in hugo site root')