Proof of concept Search WebApp to show how to implement doc. search

This adds the Proof of concept WebApp to show how to glue together
the Solr search platform with COOL server with "convert-to" and
"render-search-result" REST services and combine everything into
a document search solution.

Signed-off-by: Tomaž Vajngerl <tomaz.vajngerl@collabora.co.uk>
Change-Id: Iea3a2f6e2afee090bc7a27648390025d2a8c94d8
pull/3150/head
Tomaž Vajngerl 2021-09-12 23:39:06 +09:00 committed by Tomaž Vajngerl
parent b31eb2ab92
commit 02c60302b3
9 changed files with 422 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

70
indexing/Main.html 100644
View File

@ -0,0 +1,70 @@
<!DOCTYPE html>
<!--
  Search proof of concept - single page AngularJS client.
  The page is driven by SearchMainController (Main.js): a search form on top,
  then either the search result table or the list of all documents, and a
  status line at the bottom.
-->
<html lang="en" ng-app="SearchApp">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Search - Proof of Concept</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-KyZXEAg3QhqLMpG8r+8fhAXLRk2vvoC2f3B09zVXn8CA5QIVfZOJ3BCsw2P0p/We" crossorigin="anonymous">
    <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.8.2/angular.min.js"></script>
    <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.8.2/angular-resource.min.js"></script>
    <script src="Main.js"></script>
</head>
<body ng-controller="SearchMainController">
    <div class="container">
        <!-- Search form: query input, search button and re-index button -->
        <div class="row">
            <div class="col">
                <form class="row g-1 justify-content-md-center">
                    <div class="col-sm-6">
                        <input class="form-control" type="text" ng-model="searchString" placeholder="Search string..." >
                    </div>
                    <div class="col-auto">
                        <button type="button" ng-click="searchClicked()" ng-disabled="searchDisabled()" class="btn btn-primary mb-3">Search</button>
                    </div>
                    <div class="col-auto">
                        <button type="button" ng-click="reindexClicked()" class="btn btn-primary">Re-index Documents</button>
                    </div>
                </form>
            </div>
        </div>
        <!-- Search results: shown only while there is at least one result -->
        <div class="row" ng-show="searchResult.length > 0">
            <div class="col"><b>Search Result</b></div>
            <table class="table table-vcenter table-striped table-hover table-bordered">
                <thead>
                    <tr>
                        <th class="col-xs-6">Filename</th>
                        <th class="col-xs-6">Image</th>
                    </tr>
                </thead>
                <tbody>
                    <!-- ng-href keeps the link inert until the expression is resolved -->
                    <tr class="table-repeat" ng-repeat="result in searchResult">
                        <td><a ng-href="{{result.href}}">{{result.filename}}</a></td>
                        <td><img data-ng-src="data:image/png;base64,{{result.image}}"/></td>
                    </tr>
                </tbody>
            </table>
            <button type="button" ng-click="clearSearchResults()" class="btn btn-primary">Clear</button>
        </div>
        <!-- Document list: shown while there are no search results -->
        <div class="row" ng-show="searchResult.length == 0">
            <table class="table table-vcenter table-striped table-hover table-bordered">
                <thead>
                    <tr>
                        <th class="col-xs-6">Filename</th>
                    </tr>
                </thead>
                <tbody>
                    <tr class="table-repeat" ng-repeat="document in documents">
                        <td><a ng-href="{{document.href}}">{{document.name}}</a></td>
                    </tr>
                </tbody>
            </table>
        </div>
        <div class="row">
            <p>Status: {{status}}</p>
        </div>
    </div>
</body>
</html>

93
indexing/Main.js 100644
View File

@ -0,0 +1,93 @@
// Root AngularJS module of the search proof of concept.
var searchApp = angular.module('SearchApp', ['ngResource']);

// SearchAPI - promise based wrapper around the REST endpoints used by the page.
// No method ever rejects: a failed request resolves to a fallback value instead.
searchApp.factory('SearchAPI', ['$http',
    function ($http) {
        // Resolves to onSuccess(response), or to `fallback` when the request fails.
        function settle(request, onSuccess, fallback) {
            return request.then(onSuccess, function () {
                return fallback;
            });
        }

        // Extracts the body of a successful response.
        function bodyOf(response) {
            return response.data;
        }

        // Used where only the fact that the request succeeded matters.
        function succeeded() {
            return true;
        }

        return {
            // GET /documents - resolves to the response body, [] on failure.
            getDocuments: function () {
                return settle($http.get('/documents'), bodyOf, []);
            },
            // POST /reindex - resolves to true on success, false on failure.
            reindexDocuments: function () {
                return settle($http.post('/reindex'), succeeded, false);
            },
            // POST /search with { "query": query } - resolves to the response
            // body, null on failure.
            search: function (query) {
                var body = { "query" : query };
                return settle($http.post('/search', body), bodyOf, null);
            },
            // POST /image with one search result - resolves to the response
            // body, null on failure.
            getResultImage: function (jsonResult) {
                return settle($http.post('/image', jsonResult), bodyOf, null);
            }
        };
    }
]);
// SearchMainController - scope state and click handlers of the search page.
//   searchResult - results of the last search (each gets an `image` once rendered)
//   searchString - text bound to the search input
//   status       - text shown in the status line
//   documents    - list of all documents, loaded once on start-up
searchApp.controller('SearchMainController', ['$scope', 'SearchAPI', function ($scope, SearchAPI) {
    $scope.searchResult = [];
    $scope.searchString = "";
    $scope.status = "Ready";
    $scope.documents = [];

    $scope.reindexClicked = function () {
        $scope.status = "Reindexing....";
        // reindexDocuments resolves to false when the request failed; do not
        // report success in that case.
        SearchAPI.reindexDocuments().then(function (succeeded) {
            $scope.status = succeeded ? "Finished reindexing" : "Reindexing failed";
        });
    };

    $scope.clearSearchResults = function () {
        $scope.searchResult = [];
    };

    $scope.searchClicked = function () {
        $scope.searchResult = [];
        $scope.status = "Searching... " + $scope.searchString;
        SearchAPI.search($scope.searchString).then(function (jsonResult) {
            // search() resolves to null when the request fails; treat anything
            // that is not an array as "no results" instead of throwing on .length.
            var results = Array.isArray(jsonResult) ? jsonResult : [];
            $scope.status = "Search finished " + results.length;
            $scope.searchResult = results;
            // The preview image of every result is fetched separately and
            // attached to the result once it arrives.
            results.forEach(function (result) {
                SearchAPI.getResultImage(result).then(function (image) {
                    result.image = image;
                });
            });
        });
    };

    // The search button stays disabled while the search input is empty.
    $scope.searchDisabled = function () {
        return $scope.searchString == "";
    };

    SearchAPI.getDocuments().then(function (data) {
        $scope.documents = data;
    });

    $scope.documentSize = function () {
        return $scope.documents.length;
    };
}]);

33
indexing/README 100644
View File

@ -0,0 +1,33 @@
Searching and indexing example APP
**********************************
Main.js and Main.html - HTML/JS client side
Server.py - HTTP server and server-side processing via a REST API
Configuration
*************
Open "Server.py" and change the COOL and Solr server URL ("coolServerUrl" and "solrServerUrl") if they are different
than localhost and default ports.
The "documentPath" constant is the root location of the documents (relative to where Server.py was started).
The "solrCollectionName" constant is the collection name where Solr should store the indices.
The "coolInstance" is the URL to the COOL instance, which is used to open a document.
The collection needs to be created in Solr, if it doesn't exist yet, with (from the Solr root):
./bin/solr create -c <collection name>
For example:
./bin/solr create -c documents
HTTP Server
***********
Run the HTTP server locally with:
"python Server.py 8000 127.0.0.1"
Then connect from the web browser to "http://localhost:8000"
First time - run "Re-Index Documents" or nothing will be found

226
indexing/Server.py 100644
View File

@ -0,0 +1,226 @@
#!/usr/bin/env python
from http.server import SimpleHTTPRequestHandler, HTTPServer
from urllib import parse
from urllib.request import *
import argparse
import re
import json
import os
import requests
import xml.etree.ElementTree as ET
import base64
# Configuration
# Base URLs of the COOL and the Solr server
coolServerUrl = "http://localhost:9980"
solrServerUrl = "http://localhost:8983"
# Name of the Solr collection the requests below are addressed to
solrCollectionName = "documents"
# Directory containing the documents; used as a plain string prefix, so it
# has to end with a path separator
documentPath = "Docs/"
# URL of the COOL page that document links point to
# NOTE(review): the "f6d368a0a" path segment looks installation specific - confirm
coolInstance = coolServerUrl + "/loleaflet/f6d368a0a/loleaflet.html"

# Templates
# Both Solr endpoints are built from the same server URL and collection name
_solrCollectionUrlArgs = (solrServerUrl, solrCollectionName)
solrSelectUrl = "{}/solr/{}/select".format(*_solrCollectionUrlArgs)
solrUpdateUrl = "{}/solr/{}/update?commit=true".format(*_solrCollectionUrlArgs)
# Transform the LO indexing XML structure to Solr structure
def transformToSolrFormat(xmlContent, filename):
    """Convert indexing XML into a Solr "add" XML message.

    Every direct child of the root element whose tag is "paragraph" or
    "object" becomes one <doc>. Each <doc> gets a "filename" field, a "type"
    field (the tag name), one field per XML attribute of the entry and a
    "content" field holding the entry's text. Other children are skipped.

    Args:
        xmlContent: the indexing XML, in any form ET.fromstring() accepts.
        filename: value stored in the "filename" field of every <doc>.

    Returns:
        The UTF-8 encoded XML message (bytes), including the XML declaration.
    """
    def addField(doc, name, value):
        field = ET.SubElement(doc, "field", {"name": name})
        # An entry without text has .text == None; emit an empty field then
        field.text = value if value is not None else ""

    addElement = ET.Element("add")
    for entry in ET.fromstring(xmlContent):
        if entry.tag not in ("paragraph", "object"):
            continue
        doc = ET.SubElement(addElement, "doc")
        addField(doc, "filename", filename)
        addField(doc, "type", entry.tag)
        for attribute, value in entry.attrib.items():
            addField(doc, attribute, value)
        addField(doc, "content", entry.text)

    ET.indent(addElement, space=" ", level=0)
    return ET.tostring(addElement, encoding='utf-8', xml_declaration=True)
# Create Solr XML to remove all entries from the database
def createSolrDeleteXml():
    """Build the XML message that deletes everything matching "*:*".

    Returns:
        The UTF-8 encoded XML (bytes), including the XML declaration, with
        the structure <update><delete><query>*:*</query></delete></update>.
    """
    updateElement = ET.Element("update")
    deleteElement = ET.SubElement(updateElement, "delete")
    queryElement = ET.SubElement(deleteElement, "query")
    queryElement.text = "*:*"
    ET.indent(updateElement, space=" ", level=0)
    return ET.tostring(updateElement, encoding='utf-8', xml_declaration=True)
# Calls "Convert To - Indexing XML" service on COOL Server
def callConvertToIndexingXml(filename, filepath):
    """Upload a document to the COOL "convert-to/xml" service.

    Args:
        filename: name of the document (not used by this function).
        filepath: path of the document to upload; also sent as the name of
            the uploaded file.

    Returns:
        The response body (bytes) when the server answered with a success
        status, otherwise None.
    """
    # The context manager closes the document once the upload has finished
    with open(filepath, 'rb') as documentFile:
        filesDict = {
            'data': (filepath, documentFile, None, {})
        }
        response = requests.post("{}/lool/convert-to/xml".format(coolServerUrl), files=filesDict)
    if response.ok:
        return response.content
    return None
# Reindex all documents
def runReindexProcess():
    """Drop the existing Solr entries and index every document again.

    Documents for which the conversion to indexing XML yields nothing are
    skipped.

    Returns:
        True when all Solr updates succeeded, False as soon as the delete
        request or one of the add requests is answered with an error status.
    """
    headers = {'Content-Type' : 'text/xml'}

    # remove existing entries from the database; adding on top of entries
    # that could not be removed would leave duplicates behind
    deleteResponse = requests.post(solrUpdateUrl, data=createSolrDeleteXml(), headers=headers)
    if not deleteResponse.ok:
        return False

    # add the new indices into SOLR server
    for document in getDocuments():
        filename = document['name']
        xmlContent = callConvertToIndexingXml(filename, documentPath + filename)
        if not xmlContent:
            continue
        # add indexing XML values
        solrTransformed = transformToSolrFormat(xmlContent, filename)
        response = requests.post(solrUpdateUrl, data=solrTransformed, headers=headers)
        if not response.ok:
            return False
    return True
# Search/Query on Solr
def callQueryServiceOnSolr(jsonString):
    """Query Solr for paragraphs whose content matches the search string.

    This is a generator; nothing is executed before the first iteration.

    Args:
        jsonString: JSON text (str or bytes) of an object with a "query" key.

    Yields:
        One dict per found document of type "paragraph" with the keys
        'filename', 'href', 'type', 'index', 'node_type', 'content' and,
        when the Solr document has it, 'object_name'. At most 50 documents
        are requested from Solr.
    """
    searchStructure = json.loads(jsonString)
    query = searchStructure['query']
    # Pass the query through "params" so it gets URL-encoded; spliced into the
    # URL text, characters like '&' or '#' would change the request itself.
    # The value still reaches Solr as query syntax and is not escaped for that.
    queryParameters = {'rows' : 50, 'q' : "content:{}".format(query)}
    response = requests.get(solrSelectUrl, params=queryParameters)
    result = response.json()
    # An error answer carries no 'response' entry; treat it as "nothing found"
    responseBody = result.get('response', {})
    for document in responseBody.get('docs', []):
        documentType = document['type'][0]
        if documentType != "paragraph":
            continue
        filename = document['filename'][0]
        href = "{}?file_path=file://{}".format(coolInstance, os.path.abspath(documentPath + filename))
        returnMap = {
            'filename' : filename,
            'href' : href,
            'type' : documentType,
            'index' : document['index'][0],
            'node_type' : document['node_type'][0],
            'content' : document['content'][0]
        }
        if 'object_name' in document:
            returnMap['object_name'] = document['object_name'][0]
        yield returnMap
# Gets all the available documents contained in the document path
def getDocuments():
    """Generate one entry for every regular file directly inside documentPath.

    Yields:
        dict with "name" (the file name) and "href" (the coolInstance URL
        with the absolute path of the file as "file_path" parameter).
    """
    hrefTemplate = "{}?file_path=file://{}"
    with os.scandir(documentPath) as directoryEntries:
        for directoryEntry in directoryEntries:
            if not directoryEntry.is_file():
                continue
            absolutePath = os.path.abspath(documentPath + directoryEntry.name)
            yield {
                "name" : directoryEntry.name,
                "href" : hrefTemplate.format(coolInstance, absolutePath)
            }
# Calls "Render Search Result" service on COOL Server
# Input is search result and the document, and return the rendered image
def callRenderImageService(resultJsonString):
    """Let the COOL server render the image for one search result.

    Args:
        resultJsonString: UTF-8 encoded JSON (bytes) of one search result;
            its "filename" entry names the document inside documentPath.

    Returns:
        The base64 encoded response body (bytes), or None when the file name
        is not a plain name inside documentPath or the server answered with
        an error status.
    """
    result = json.loads(resultJsonString)
    filename = result['filename']

    # The name comes from the client: accept plain file names only, so that
    # "../" or absolute paths cannot make us upload files outside documentPath
    isPlainName = bool(filename) and os.path.basename(filename) == filename
    if not isPlainName or filename in (os.curdir, os.pardir):
        return None

    # Enclose json with [] - as the server supports more search results, which are then combined
    resultJsonProcessed = '[ ' + resultJsonString.decode('utf-8') + ' ]'

    # The context manager closes the document once the upload has finished
    with open(documentPath + filename, 'rb') as documentFile:
        filesDict = {
            "document": (filename, documentFile, None, {}),
            "result" : ("json", resultJsonProcessed, None, {})
        }
        response = requests.post("{}/lool/render-search-result".format(coolServerUrl), files=filesDict)

    # Do not hand out an error page disguised as image data
    if not response.ok:
        return None
    return base64.b64encode(response.content)
# HTTP Server - Handle HTTP requests
class HTTPRequestHandler(SimpleHTTPRequestHandler):
    """Serves the static client files and the REST API used by Main.js.

    POST requests are dispatched on the path: "/search", "/reindex" and
    "/image"; any other POST is answered with 403. GET "/" serves Main.html,
    GET paths containing "/documents" return the document list as JSON and
    every other GET is handled as a static file request by the base class.
    """

    def _readRequestBody(self):
        """Return the request body; empty when no Content-Length was sent."""
        contentLength = int(self.headers.get('Content-Length') or 0)
        return self.rfile.read(contentLength)

    def handleImageRequest(self):
        """Answer with the base64 encoded image of the posted search result."""
        imageBase64 = callRenderImageService(self._readRequestBody())
        if not imageBase64:
            self.send_response(403)
            self.end_headers()
            return
        self.send_response(200)
        self.send_header('Content-Type', 'text/plain')
        self.end_headers()
        self.wfile.write(imageBase64)

    def handleReindexRequest(self):
        """Run the reindex process; 200 on success, 403 on failure."""
        if runReindexProcess():
            self.send_response(200)
        else:
            self.send_response(403)
        self.end_headers()

    def handleSearchRequest(self):
        """Answer with the search results as a JSON list.

        A search without hits is a successful search: it is answered with
        200 and an empty list, so the client can tell it from a failure.
        """
        searchResult = list(callQueryServiceOnSolr(self._readRequestBody()))
        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        self.wfile.write(json.dumps(searchResult).encode('utf8'))

    def handleDocumentsRequest(self):
        """Answer with the list of all available documents as JSON."""
        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        data = json.dumps(list(getDocuments()))
        self.wfile.write(data.encode('utf8'))

    def do_POST(self):
        if re.search('/search', self.path):
            self.handleSearchRequest()
        elif re.search('/reindex', self.path):
            self.handleReindexRequest()
        elif re.search('/image', self.path):
            self.handleImageRequest()
        else:
            self.send_response(403)
            self.end_headers()

    def do_GET(self):
        # "/" is an alias of the main page; after the rewrite the request has
        # to fall through to the dispatch below, otherwise nothing is sent
        if self.path == '/':
            self.path = '/Main.html'
        if re.search('/documents', self.path):
            self.handleDocumentsRequest()
        else:
            return SimpleHTTPRequestHandler.do_GET(self)
# run with "python Server.py 8000 127.0.0.1"
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='HTTP Server')
    # nargs='?' makes the positional arguments optional; without it argparse
    # requires both of them and the declared defaults are never used
    parser.add_argument('port', type=int, nargs='?', default=8000, help='Listening port for HTTP Server')
    parser.add_argument('ip', nargs='?', default="127.0.0.1", help='HTTP Server IP')
    args = parser.parse_args()

    server = HTTPServer((args.ip, args.port), HTTPRequestHandler)
    print('HTTP Server Running...........')
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the regular way to stop the server
        pass
    finally:
        server.server_close()