Proof of concept Search WebApp to show how to implement doc. search
This adds the Proof of concept WebApp to show how to glue together the Solr search platform with COOL server with "convert-to" and "render-search-result" REST services and combine everything into a document search solution. Signed-off-by: Tomaž Vajngerl <tomaz.vajngerl@collabora.co.uk> Change-Id: Iea3a2f6e2afee090bc7a27648390025d2a8c94d8pull/3150/head
parent
b31eb2ab92
commit
02c60302b3
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,70 @@
|
|||
<!DOCTYPE html>
<html lang="en" ng-app="SearchApp">

<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Search - Proof of Concept</title>
  <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-KyZXEAg3QhqLMpG8r+8fhAXLRk2vvoC2f3B09zVXn8CA5QIVfZOJ3BCsw2P0p/We" crossorigin="anonymous">
  <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.8.2/angular.min.js"></script>
  <script src="https://ajax.googleapis.com/ajax/libs/angularjs/1.8.2/angular-resource.min.js"></script>
  <script src="Main.js"></script>
</head>

<body ng-controller="SearchMainController">
  <div class="container">
    <!-- Search form: the query box plus the "Search" and "Re-index Documents" actions -->
    <div class="row">
      <div class="col">
        <form class="row g-1 justify-content-md-center">
          <div class="col-sm-6">
            <input class="form-control" type="text" ng-model="searchString" placeholder="Search string..." >
          </div>
          <div class="col-auto">
            <button type="button" ng-click="searchClicked()" ng-disabled="searchDisabled()" class="btn btn-primary mb-3">Search</button>
          </div>
          <div class="col-auto">
            <button type="button" ng-click="reindexClicked()" class="btn btn-primary">Re-index Documents</button>
          </div>
        </form>
      </div>
    </div>
    <!-- Search hits: shown only while there is at least one result -->
    <div class="row" ng-show="searchResult.length > 0">
      <div class="col"><b>Search Result</b></div>
      <table class="table table-vcenter table-striped table-hover table-bordered">
        <thead>
          <tr>
            <th class="col-xs-6">Filename</th>
            <th class="col-xs-6">Image</th>
          </tr>
        </thead>
        <tbody>
          <tr class="table-repeat" ng-repeat="result in searchResult">
            <!-- ng-href keeps the link inert until the expression has been interpolated -->
            <td><a ng-href="{{result.href}}">{{result.filename}}</a></td>
            <td><img data-ng-src="data:image/png;base64,{{result.image}}"/></td>
          </tr>
        </tbody>
      </table>
      <button type="button" ng-click="clearSearchResults()" class="btn btn-primary">Clear</button>
    </div>
    <!-- Document list: shown while there are no search hits -->
    <div class="row" ng-show="searchResult.length == 0">
      <table class="table table-vcenter table-striped table-hover table-bordered">
        <thead>
          <tr>
            <th class="col-xs-6">Filename</th>
          </tr>
        </thead>
        <tbody>
          <tr class="table-repeat" ng-repeat="document in documents">
            <td><a ng-href="{{document.href}}">{{document.name}}</a></td>
          </tr>
        </tbody>
      </table>
    </div>
    <div class="row">
      <p>Status: {{status}}</p>
    </div>
  </div>
</body>

</html>
|
|
@ -0,0 +1,93 @@
|
|||
// Root AngularJS module of the search proof of concept; Main.html bootstraps it through ng-app="SearchApp".
var searchApp = angular.module('SearchApp', ['ngResource']);
|
||||
|
||||
// 'SearchAPI' service: thin wrappers around the REST endpoints of the search server.
// Every method returns a promise that always resolves; a failed request resolves
// with a per-method fallback value instead of rejecting.
searchApp.factory('SearchAPI', ['$http',
    function ($http) {
        // Resolves with onSuccess(response), or with `fallback` when the request fails.
        function settle(request, onSuccess, fallback) {
            return request.then(onSuccess, function () {
                return fallback;
            });
        }

        // Extracts the body of a successful response.
        function bodyOf(response) {
            return response.data;
        }

        return {
            // GET /documents -> response body, or [] on failure.
            getDocuments: function () {
                return settle($http.get('/documents'), bodyOf, []);
            },

            // POST /reindex -> true on success, false on failure.
            reindexDocuments: function () {
                return settle($http.post('/reindex'), function () {
                    return true;
                }, false);
            },

            // POST /search with { "query": query } -> response body, or null on failure.
            search: function (query) {
                return settle($http.post('/search', { "query": query }), bodyOf, null);
            },

            // POST /image with one search result -> response body, or null on failure.
            getResultImage: function (jsonResult) {
                return settle($http.post('/image', jsonResult), bodyOf, null);
            }
        };
    }
]);
|
||||
|
||||
// Controller behind Main.html: keeps the search string, the search results, the
// list of available documents and a one-line status message on the scope.
searchApp.controller('SearchMainController', ['$scope', 'SearchAPI', function ($scope, SearchAPI) {

    $scope.searchResult = [];
    $scope.searchString = "";
    $scope.status = "Ready";
    $scope.documents = [];

    // Asks the server to rebuild the index and reports the outcome in the status line.
    $scope.reindexClicked = function () {
        $scope.status = "Reindexing....";

        // reindexDocuments() resolves with false (it never rejects) when the request fails.
        SearchAPI.reindexDocuments().then(function (succeeded) {
            $scope.status = succeeded ? "Finished reindexing" : "Reindexing failed";
        });
    };

    // Drops the current search results.
    $scope.clearSearchResults = function () {
        $scope.searchResult = [];
    };

    // Runs the search for the current search string and then requests a rendered
    // image for every hit.
    $scope.searchClicked = function () {
        const results = [];
        $scope.searchResult = results;
        $scope.status = "Searching... " + $scope.searchString;

        SearchAPI.search($scope.searchString).then(function (jsonResult) {
            // A newer search or "Clear" replaced the list while this request was
            // in flight - ignore the outdated answer.
            if ($scope.searchResult !== results) {
                return;
            }

            // search() resolves with null when the request fails.
            if (!Array.isArray(jsonResult)) {
                $scope.status = "Search failed or nothing was found";
                return;
            }

            $scope.status = "Search finished " + jsonResult.length;

            jsonResult.forEach(function (result) {
                results.push(result);
                SearchAPI.getResultImage(result).then(function (image) {
                    // getResultImage() resolves with null on failure; leave the
                    // image unset then, so no broken "data:" URI is built.
                    if (image) {
                        result.image = image;
                    }
                });
            });
        });
    };

    // The search button is disabled while the search string is empty.
    $scope.searchDisabled = function () {
        return $scope.searchString === "";
    };

    // Load the list of available documents once at start-up.
    SearchAPI.getDocuments().then(function (data) {
        $scope.documents = data;
    });

    // Number of currently known documents.
    $scope.documentSize = function () {
        return $scope.documents.length;
    };

}]);
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
Searching and indexing example APP
|
||||
**********************************
|
||||
|
||||
Main.js and Main.html - HTML/JS client side
|
||||
Server.py - HTTP server and server-side processing via a REST API
|
||||
|
||||
Configuration
|
||||
*************
|
||||
|
||||
Open "Server.py" and change the COOL and Solr server URL ("coolServerUrl" and "solrServerUrl") if they are different
|
||||
than localhost and default ports.
|
||||
|
||||
The "documentPath" constant is the root location of the documents (relative to where Server.py was started).
|
||||
|
||||
The "solrCollectionName" constant is the collection name where Solr should store the indices.
|
||||
|
||||
The "coolInstance" is the URL to the COOL instance, which is used to open a document.
|
||||
|
||||
The collection needs to be created in Solr if it doesn't exist yet (run from the Solr root directory):
|
||||
./bin/solr create -c <collection name>
|
||||
|
||||
For example:
|
||||
./bin/solr create -c documents
|
||||
|
||||
HTTP Server
|
||||
***********
|
||||
|
||||
Run the HTTP server locally with:
|
||||
"python Server.py 8000 127.0.0.1"
|
||||
|
||||
Then connect from the web browser to "http://localhost:8000"
|
||||
|
||||
First time - run "Re-index Documents" or nothing will be found
|
|
@ -0,0 +1,226 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
from http.server import SimpleHTTPRequestHandler, HTTPServer
|
||||
from urllib import parse
|
||||
from urllib.request import *
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import xml.etree.ElementTree as ET
|
||||
import base64
|
||||
|
||||
# Configuration

# Base URLs of the COOL server and the Solr server
coolServerUrl = "http://localhost:9980"
solrServerUrl = "http://localhost:8983"

# Root location of the documents and name of the Solr collection that holds the indices
documentPath = "Docs/"
solrCollectionName = "documents"

# URL of the COOL instance used to open a document
coolInstance = coolServerUrl + "/loleaflet/f6d368a0a/loleaflet.html"

# Templates

# Solr query and update endpoints for the configured collection
solrSelectUrl, solrUpdateUrl = (
    template.format(solrServerUrl, solrCollectionName)
    for template in ("{}/solr/{}/select", "{}/solr/{}/update?commit=true")
)
|
||||
|
||||
# Transform the LO indexing XML structure to Solr structure
def transformToSolrFormat(xmlContent, filename):
    """Convert indexing XML into a Solr "add" update document.

    Every direct child of the root element whose tag is 'paragraph' or
    'object' becomes one <doc>. Each <doc> carries, in this order, the fields
    'filename', 'type' (the tag name), one field per XML attribute of the
    entry, and 'content' (the text of the entry). Other children are skipped.

    xmlContent -- the indexing XML (str or bytes)
    filename   -- value stored in the 'filename' field of every <doc>

    Returns the UTF-8 encoded XML (bytes), including the XML declaration.
    """
    builder = ET.TreeBuilder()

    def addField(name, value):
        builder.start("field", {"name" : name})
        builder.data(value)
        builder.end("field")

    root = ET.fromstring(xmlContent)
    builder.start("add", {})

    for entry in root:
        if entry.tag not in ('paragraph', 'object'):
            continue
        builder.start("doc", {})
        addField("filename", filename)
        addField("type", entry.tag)
        for attribute, value in entry.attrib.items():
            addField(attribute, value)
        # an element without text has text == None, which is not valid character data
        addField("content", entry.text or "")
        builder.end("doc")

    builder.end("add")

    et = ET.ElementTree(builder.close())
    ET.indent(et, space=" ", level=0)
    return ET.tostring(et.getroot(), encoding='utf-8', xml_declaration=True)
|
||||
|
||||
# Create Solr XML to remove all entries from the database
def createSolrDeleteXml():
    """Build the Solr update XML holding a delete-by-query for "*:*".

    Returns the UTF-8 encoded XML (bytes), including the XML declaration.
    """
    updateElement = ET.Element("update")
    deleteElement = ET.SubElement(updateElement, "delete")
    queryElement = ET.SubElement(deleteElement, "query")
    queryElement.text = "*:*"

    ET.indent(updateElement, space=" ", level=0)
    return ET.tostring(updateElement, encoding='utf-8', xml_declaration=True)
|
||||
|
||||
# Calls "Convert To - Indexing XML" service on COOL Server
def callConvertToIndexingXml(filename, filepath):
    """Upload a document to the "convert-to/xml" service of the COOL server.

    filename -- name of the document (not used by this function)
    filepath -- path of the document file that is uploaded

    Returns the response body (bytes) when the server answers with a success
    status, otherwise None.
    """
    # the context manager closes the file again, also when the request raises
    with open(filepath, 'rb') as documentFile:
        filesDict = {
            'data': (filepath, documentFile, None, {})
        }
        response = requests.post("{}/lool/convert-to/xml".format(coolServerUrl), files=filesDict)

    if response.ok:
        return response.content
    return None
|
||||
|
||||
# Reindex all documents
def runReindexProcess():
    """Rebuild the Solr index from the documents returned by getDocuments().

    First removes all existing entries, then converts every document to
    indexing XML, transforms it to the Solr format and posts it to Solr.
    Documents for which the conversion returns nothing are skipped.

    Returns True on success, and False as soon as Solr rejects a request.
    """
    headers = {'Content-Type' : 'text/xml'}

    # remove existing entries from the database
    response = requests.post(solrUpdateUrl, data=createSolrDeleteXml(), headers=headers)
    if not response.ok:
        # do not add new entries on top of stale ones and report that as success
        return False

    # add the new indices into SOLR server
    for document in getDocuments():
        filename = document['name']
        xmlContent = callConvertToIndexingXml(filename, documentPath + filename)
        if not xmlContent:
            continue

        # add indexing XML values
        solrTransformed = transformToSolrFormat(xmlContent, filename)
        response = requests.post(solrUpdateUrl, data=solrTransformed, headers=headers)
        if not response.ok:
            return False

    return True
|
||||
|
||||
# Search/Query on Solr
def callQueryServiceOnSolr(jsonString):
    """Query the 'content' field in Solr and yield one dict per paragraph hit.

    jsonString -- JSON text (str or bytes) of an object with a 'query' entry

    At most 50 rows are requested. Each yielded dict has the keys 'filename',
    'href', 'type', 'index', 'node_type' and 'content', plus 'object_name'
    when the Solr document has it. Hits whose type is not "paragraph" are
    skipped. Nothing is yielded when Solr answers with an error status or
    reports no hits.
    """
    searchStructure = json.loads(jsonString)
    query = searchStructure['query']

    # pass the query through "params" so that requests URL-encodes the user input
    parameters = {'rows' : 50, 'q' : "content:{}".format(query)}
    response = requests.get(solrSelectUrl, params=parameters)
    if not response.ok:
        return

    responseBody = response.json()['response']
    if responseBody['numFound'] <= 0:
        return

    for document in responseBody['docs']:
        entryType = document['type'][0]
        if entryType != "paragraph":
            continue

        filename = document['filename'][0]
        href = "{}?file_path=file://{}".format(coolInstance, os.path.abspath(documentPath + filename))
        returnMap = {
            'filename' : filename,
            'href' : href,
            'type' : entryType,
            'index' : document['index'][0],
            'node_type' : document['node_type'][0],
            'content' : document['content'][0]
        }
        if 'object_name' in document:
            returnMap['object_name'] = document['object_name'][0]
        yield returnMap
|
||||
|
||||
# Gets all the available documents contained in the document path
def getDocuments():
    """Yield a dict with 'name' and 'href' for every regular file in documentPath.

    'name' is the file name, and 'href' is the coolInstance URL with the
    absolute file path passed as the "file_path" parameter.
    """
    with os.scandir(documentPath) as directory:
        for item in directory:
            if not item.is_file():
                continue
            absolutePath = os.path.abspath(documentPath + item.name)
            yield {
                "name" : item.name,
                "href" : "{}?file_path=file://{}".format(coolInstance, absolutePath)
            }
|
||||
# Calls "Render Search Result" service on COOL Server
# Input is search result and the document, and return the rendered image
def callRenderImageService(resultJsonString):
    """Render one search result through the "render-search-result" service.

    resultJsonString -- UTF-8 encoded JSON (bytes) of one search result; its
                        'filename' entry names the document in documentPath

    Returns the base64 encoded response body (bytes), or None when the server
    answers with an error status.
    """
    result = json.loads(resultJsonString)
    # the file name comes from the client: keep only the last path component,
    # so that it cannot point outside of documentPath
    filename = os.path.basename(result['filename'])

    # Enclose json with [] - as the server supports more search results, which are then combined
    resultJsonProcessed = '[ ' + resultJsonString.decode('utf-8') + ' ]'

    # the context manager closes the document again, also when the request raises
    with open(documentPath + filename, 'rb') as documentFile:
        filesDict = {
            "document": (filename, documentFile, None, {}),
            "result" : ("json", resultJsonProcessed, None, {})
        }
        response = requests.post("{}/lool/render-search-result".format(coolServerUrl), files=filesDict)

    if not response.ok:
        # an error body is not an image - do not hand it out as one
        return None
    return base64.b64encode(response.content)
|
||||
|
||||
# HTTP Server - Handle HTTP requests
class HTTPRequestHandler(SimpleHTTPRequestHandler):
    """Serves the static client files and the REST API used by the client.

    POST /search   - body: JSON with the query; answers with a JSON list of results
    POST /reindex  - rebuilds the index
    POST /image    - body: JSON of one search result; answers with a base64 image
    GET /documents - answers with a JSON list of the available documents
    GET /          - serves Main.html
    Any other GET is handled by SimpleHTTPRequestHandler.
    """

    def _readRequestBody(self):
        """Return the request body (bytes); empty when Content-Length is missing."""
        length = int(self.headers.get('Content-Length', 0))
        return self.rfile.read(length)

    def handleImageRequest(self):
        """Answer with the rendered image (base64), or 403 when there is none."""
        jsonString = self._readRequestBody()
        imageBase64 = callRenderImageService(jsonString)
        if imageBase64:
            self.send_response(200)
        else:
            self.send_response(403)
        self.end_headers()
        if imageBase64:
            self.wfile.write(imageBase64)

    def handleReindexRequest(self):
        """Run the reindex process; answer 200 on success and 403 on failure."""
        if runReindexProcess():
            self.send_response(200)
        else:
            self.send_response(403)
        self.end_headers()

    def handleSearchRequest(self):
        """Answer with the search results as a JSON list."""
        jsonString = self._readRequestBody()
        searchResult = list(callQueryServiceOnSolr(jsonString))
        # A search without hits is not an error: answer 200 with an empty
        # list, so that the client can tell "nothing found" from a failure.
        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        data = json.dumps(searchResult)
        self.wfile.write(data.encode('utf8'))

    def handleDocumentsRequest(self):
        """Answer with the available documents as a JSON list."""
        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        data = json.dumps(list(getDocuments()))
        self.wfile.write(data.encode('utf8'))

    def do_POST(self):
        """Dispatch POST requests to the matching handler; 403 for unknown paths."""
        if re.search('/search', self.path):
            self.handleSearchRequest()
        elif re.search('/reindex', self.path):
            self.handleReindexRequest()
        elif re.search('/image', self.path):
            self.handleImageRequest()
        else:
            self.send_response(403)
            self.end_headers()

    def do_GET(self):
        """Serve the document list, the main page for "/", or a static file."""
        if re.search('/documents', self.path):
            self.handleDocumentsRequest()
            return None
        if self.path == '/':
            # only rewriting the path sends nothing - the rewritten path has
            # to be served by the static file handler below
            self.path = '/Main.html'
        return SimpleHTTPRequestHandler.do_GET(self)
|
||||
|
||||
#run with "python Server.py 8000 127.0.0.1"
# Both arguments are optional: port defaults to 8000 and ip to 127.0.0.1

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='HTTP Server')
    # nargs='?' makes a positional argument optional - without it the default is never used
    parser.add_argument('port', type=int, nargs='?', default=8000, help='Listening port for HTTP Server')
    parser.add_argument('ip', nargs='?', default="127.0.0.1", help='HTTP Server IP')
    args = parser.parse_args()

    server = HTTPServer((args.ip, args.port), HTTPRequestHandler)
    print('HTTP Server Running...........')
    server.serve_forever()
|
Loading…
Reference in New Issue