harfang3d/doc/search_index.py
2021-10-13 14:40:31 +02:00

170 lines
3.9 KiB
Python

from bs4 import BeautifulSoup
import json
import os
# Accumulated search entries (dicts); filter_and_parse() and
# extract_page_functions() append to it and save() writes it out.
index = []
def get_soup_title_and_category(uid, soup):
    """Derive the search title and category of a documentation page.

    The title is looked up, in order, in:
      1. the ``<h1>`` of the ``doc-details`` div,
      2. the last link of the ``doc-breadcrumb`` div,
      3. the first ``<h1>`` of the page (only when there is no breadcrumb).
    When nothing usable is found, or the title is the generic
    'Documentation', ``uid`` is used instead.

    Args:
        uid: Unique identifier of the page.
        soup: Parsed page; only ``find()`` is called on it.

    Returns:
        A ``(title, category)`` tuple where category is one of 'tutorial',
        'manual', 'class' or 'other'.
    """
    is_tutorial = soup.find('table', class_='tutorial-header') is not None

    title = None

    doc_details = soup.find('div', class_='doc-details')
    if doc_details is not None:
        h1 = doc_details.find('h1')
        if h1 is not None:
            title = str(h1.text)

    if title is None:
        breadcrumb = soup.find('div', class_='doc-breadcrumb')
        if breadcrumb is not None:
            links = breadcrumb.find_all('a')
            # `.string` is None when the link has no single text child; leave
            # the title unset then, rather than turning it into 'None'.
            if links and links[-1].string is not None:
                title = str(links[-1].string)
        else:
            h1 = soup.find('h1')
            if h1 is not None and h1.string is not None:
                title = str(h1.string)

    if title is None or title == 'Documentation':
        title = uid  # fallback to uid

    # determine category
    if is_tutorial:
        return "Tutorial '%s'" % title, 'tutorial'
    if uid.startswith('man.'):
        return title, 'manual'
    if title.endswith('Class'):
        # drop the 'Class' suffix from the displayed title
        return title[:-5].rstrip(), 'class'
    return title, 'other'
def filter_and_parse(uid, version, html, filter):
    """Parse a page, record its search entry and return the filtered soup.

    Args:
        uid: Unique identifier of the page.
        version: Documentation version; stored as a string in the entry.
        html: HTML source of the page.
        filter: Callable taking the parsed soup and returning a
            ``(soup, body)`` pair.

    Returns:
        ``(soup, title)``, or ``None`` when the filter yields an empty body
        (no entry is recorded in that case).
    """
    parsed = BeautifulSoup(html, 'html.parser')

    # title and category are read before the filter strips anything
    title, category = get_soup_title_and_category(uid, parsed)

    soup, body = filter(parsed)
    if body == '':
        return None

    # gather excerpt candidates: raw paragraph content first ...
    candidates = [paragraph.string for paragraph in soup.find_all('p')]
    # ... then function documentation
    for div in soup.find_all('div'):
        candidates.extend(
            div.string
            for class_name in div.attrs.get('class', ())
            if class_name == 'function_doc'
        )

    # keep only the candidates that hold some non-blank text
    excerpts = []
    for candidate in candidates:
        if candidate is None:
            continue
        text = str(candidate).strip()
        if text:
            excerpts.append(text)

    index.append({
        'uid': uid,
        'category': category,
        'title': title,
        'body': body,
        'excerpts': excerpts,
        'version': str(version),
    })
    return soup, title
def clean_text(t):
    """Return ``t`` with leading and trailing whitespace removed."""
    return t.strip()
def extract_page_functions(uid, version, page_title, soup):
    """Append one 'function' search entry per ``function_div`` of a page.

    Args:
        uid: Unique identifier of the page holding the functions.
        version: Documentation version; stored as a string in each entry.
        page_title: Title used for functions whose id is not prefixed by
            ``uid``.
        soup: Parsed page to scan for ``div`` elements of class
            ``function_div``.

    Each entry gets the uid ``'<uid>#<div id>'``. Divs without an ``id``
    attribute are skipped, since there is nothing to derive a name from.
    """
    for function_div in soup.find_all('div', class_='function_div'):
        anchor = function_div.get('id')
        if anchor is None:
            continue

        if anchor.startswith(uid):
            # drop the uid prefix and the one-character separator after it
            name = anchor[len(uid) + 1:]
            title = '%s.%s' % (uid, name)
        else:
            name = anchor
            title = page_title

        function_doc = function_div.find('div', class_='function_doc')
        excerpts = [clean_text(function_doc.text)] if function_doc else []

        index.append({
            'uid': '%s#%s' % (uid, anchor),
            'category': 'function',
            'title': title,
            'body': name,
            'excerpts': excerpts,
            'version': str(version),
        })
def parse(uid, version, html):
    """Index a documentation page and the functions it documents.

    Args:
        uid: Unique identifier of the page.
        version: Documentation version the page belongs to.
        html: HTML source of the page.

    Pages whose filtered body is empty are not indexed at all.
    """
    def basic_stripper_filter(soup):
        """Strip navigation and code blocks; return ``(soup, body)``."""
        # strip breadcrumb
        breadcrumb = soup.find('div', {'class': 'doc-breadcrumb'})
        if breadcrumb is not None:
            breadcrumb.extract()

        # strip version footer
        version_footer = soup.find('small', {'id': 'version-footer'})
        if version_footer is not None:
            version_footer.extract()

        # strip highlighted code blocks
        for code_div in soup.find_all('div', {'class': 'codehilite'}):
            code_div.extract()

        # extract body
        body = soup.get_text(separator=' ')

        # remove the spaces sitting right inside brackets, before
        # punctuation and around line breaks
        for s in ('(', '[', '{', '\n'):
            body = body.replace(s + ' ', s)
        for s in (')', ']', '}', ',', ':', ';', '\n'):
            body = body.replace(' ' + s, s)

        # collapse runs of spaces and of line breaks; each pass roughly
        # halves the length of a run
        for _ in range(4):
            body = body.replace('  ', ' ')
            body = body.replace('\n\n', '\n')

        return soup, body

    result = filter_and_parse(uid, version, html, basic_stripper_filter)
    if result is None:
        # empty body: nothing was indexed, so there is nothing to unpack
        return

    soup, title = result

    # extract page functions
    extract_page_functions(uid, version, title, soup)
def save(dir, path):
    """Write the collected entries to the file ``path`` inside ``dir``.

    Every entry is written as a JSON document on its own line, preceded by
    an ``{"index":{}}`` line; one more ``{"index":{}}`` line follows the last
    entry.
    NOTE(review): this looks like the Elasticsearch bulk format, where a
    trailing action line without a document is unusual - confirm it is wanted.
    """
    action_line = '{"index":{}}\n'
    target = os.path.join(dir, path)

    with open(target, 'w') as out:
        for entry in index:
            out.write(action_line)
            out.write(json.dumps(entry))
            out.write('\n')
        out.write(action_line)