Compare commits

...

16 Commits

Author SHA1 Message Date
Mishig Davaadorj 24f0899150 update SIMILARITY_SCORE_THRESHOLD 2023-11-08 14:21:18 +01:00
Mishig Davaadorj 00ed591093 more prompt engineering 2023-11-08 14:00:24 +01:00
Mishig Davaadorj d5c3bb76ea error handling 2023-11-08 11:52:54 +01:00
Mishig Davaadorj bc6484d55f clearer typing 2023-11-08 11:37:12 +01:00
Mishig Davaadorj eecbc31853 embed pages of 5 until close text chunk is found 2023-11-08 11:37:05 +01:00
Mishig Davaadorj 199a10afdd Merge branch 'main' into websearch_4 2023-11-07 16:13:59 +01:00
Mishig Davaadorj 6ee0107d67 more prompt engineering 2023-10-25 17:39:54 +02:00
Mishig Davaadorj e0051369e7 more prompt engineering 2023-10-25 17:35:52 +02:00
Mishig Davaadorj fad35eb734 use updated tfjs model 2023-10-24 11:53:34 +02:00
Mishig Davaadorj a5f9aa3075 correct join seperator 2023-10-24 11:53:34 +02:00
Mishig Davaadorj 1c0ec575c6 chore 2023-10-24 11:53:34 +02:00
Mishig Davaadorj 09b85ab8d4 Implement new markdown sliding paragraph chunker 2023-10-24 11:53:34 +02:00
Mishig Davaadorj 2f49002277 add new dep types 2023-10-24 11:53:34 +02:00
Mishig Davaadorj 2b5231a091 mv prettier from devDep to Dep 2023-10-24 11:53:34 +02:00
Mishig Davaadorj 2cd1f1aff9 add new deps 2023-10-24 11:53:34 +02:00
Mishig Davaadorj 8c7fd65237 rename function 2023-10-24 11:53:34 +02:00
11 changed files with 545 additions and 94 deletions

2
.gitignore vendored
View File

@ -11,3 +11,5 @@ vite.config.js.timestamp-*
vite.config.ts.timestamp-*
SECRET_CONFIG
.idea
# directory for local transformers.js models
local-models

349
package-lock.json generated
View File

@ -13,20 +13,23 @@
"@xenova/transformers": "^2.6.0",
"autoprefixer": "^10.4.14",
"aws4fetch": "^1.0.17",
"cheerio": "^1.0.0-rc.12",
"date-fns": "^2.29.3",
"dotenv": "^16.0.3",
"handlebars": "^4.7.8",
"highlight.js": "^11.7.0",
"jsdom": "^22.0.0",
"marked": "^4.3.0",
"mongodb": "^5.8.0",
"nanoid": "^4.0.2",
"openid-client": "^5.4.2",
"parquetjs": "^0.11.2",
"postcss": "^8.4.31",
"prettier": "^2.8.0",
"serpapi": "^1.1.1",
"tailwind-scrollbar": "^3.0.0",
"tailwindcss": "^3.3.1",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2",
"zod": "^3.22.3"
},
"devDependencies": {
@ -38,13 +41,14 @@
"@types/jsdom": "^21.1.1",
"@types/marked": "^4.0.8",
"@types/parquetjs": "^0.10.3",
"@types/prettier": "^2.7.3",
"@types/turndown": "^5.0.2",
"@typescript-eslint/eslint-plugin": "^6.x",
"@typescript-eslint/parser": "^6.x",
"eslint": "^8.28.0",
"eslint-config-prettier": "^8.5.0",
"eslint-plugin-svelte": "^2.30.0",
"marked-katex-extension": "^3.0.6",
"prettier": "^2.8.0",
"prettier-plugin-svelte": "^2.10.1",
"prettier-plugin-tailwindcss": "^0.2.7",
"svelte": "^4.0.0",
@ -1026,6 +1030,9 @@
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz",
"integrity": "sha512-XCuKFP5PS55gnMVu3dty8KPatLqUoy/ZYzDzAGCQ8JNFCkLXzmI7vNHCR+XpbZaMWQK/vQubr7PkYq8g470J/A==",
"dev": true,
"optional": true,
"peer": true,
"engines": {
"node": ">= 10"
}
@ -1138,6 +1145,12 @@
"@types/node-int64": "*"
}
},
"node_modules/@types/prettier": {
"version": "2.7.3",
"resolved": "https://registry.npmjs.org/@types/prettier/-/prettier-2.7.3.tgz",
"integrity": "sha512-+68kP9yzs4LMp7VNh8gdzMSPZFL44MLGqiHWvttYJe+6qnuVr4Ek9wSBQoveqY/r+LwjCcU29kNVkidwim+kYA==",
"dev": true
},
"node_modules/@types/pug": {
"version": "2.0.7",
"resolved": "https://registry.npmjs.org/@types/pug/-/pug-2.0.7.tgz",
@ -1162,6 +1175,12 @@
"integrity": "sha512-Q5vtl1W5ue16D+nIaW8JWebSSraJVlK+EthKn7e7UcD4KWsaSJ8BqGPXNaPghgtcn/fhvrN17Tv8ksUsQpiplw==",
"dev": true
},
"node_modules/@types/turndown": {
"version": "5.0.2",
"resolved": "https://registry.npmjs.org/@types/turndown/-/turndown-5.0.2.tgz",
"integrity": "sha512-ghbjIyvMSQn/UGEuQJD6C4DfbokyYqGRhNAetWH02qnuRfvRZz9qTOG9e0RPkVqGsjv+YsjF3gRp7yFKvc/1PA==",
"dev": true
},
"node_modules/@types/webidl-conversions": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.0.tgz",
@ -1476,7 +1495,10 @@
"node_modules/abab": {
"version": "2.0.6",
"resolved": "https://registry.npmjs.org/abab/-/abab-2.0.6.tgz",
"integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA=="
"integrity": "sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/acorn": {
"version": "8.10.0",
@ -1512,6 +1534,9 @@
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz",
"integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"debug": "4"
},
@ -1617,7 +1642,10 @@
"node_modules/asynckit": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/autoprefixer": {
"version": "10.4.14",
@ -1724,6 +1752,11 @@
"integrity": "sha512-DRQrD6gJyy8FbiE4s+bDoXS9hiW3Vbx5uCdwvcCf3zLHL+Iv7LtGHLpr+GZV8rHG8tK766FGYBwRbu8pELTt+w==",
"dev": true
},
"node_modules/boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="
},
"node_modules/brace-expansion": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
@ -1933,6 +1966,42 @@
"node": "*"
}
},
"node_modules/cheerio": {
"version": "1.0.0-rc.12",
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz",
"integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==",
"dependencies": {
"cheerio-select": "^2.1.0",
"dom-serializer": "^2.0.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"htmlparser2": "^8.0.1",
"parse5": "^7.0.0",
"parse5-htmlparser2-tree-adapter": "^7.0.0"
},
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
}
},
"node_modules/cheerio-select": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
"dependencies": {
"boolbase": "^1.0.0",
"css-select": "^5.1.0",
"css-what": "^6.1.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/chokidar": {
"version": "3.5.3",
"resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.5.3.tgz",
@ -2038,6 +2107,9 @@
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"delayed-stream": "~1.0.0"
},
@ -2112,6 +2184,21 @@
"node": ">= 8"
}
},
"node_modules/css-select": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz",
"integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==",
"dependencies": {
"boolbase": "^1.0.0",
"css-what": "^6.1.0",
"domhandler": "^5.0.2",
"domutils": "^3.0.1",
"nth-check": "^2.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-tree": {
"version": "2.3.1",
"resolved": "https://registry.npmjs.org/css-tree/-/css-tree-2.3.1.tgz",
@ -2125,6 +2212,17 @@
"node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0"
}
},
"node_modules/css-what": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz",
"integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==",
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/cssesc": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz",
@ -2140,6 +2238,9 @@
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-3.0.0.tgz",
"integrity": "sha512-N4u2ABATi3Qplzf0hWbVCdjenim8F3ojEXpBDF5hBpjzW182MjNGLqfmQ0SkSPeQ+V86ZXgeH8aXj6kayd4jgg==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"rrweb-cssom": "^0.6.0"
},
@ -2151,6 +2252,9 @@
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/data-urls/-/data-urls-4.0.0.tgz",
"integrity": "sha512-/mMTei/JXPqvFqQtfyTowxmJVwr2PVAeCcDxyFf6LhoOu/09TX2OX3kb2wzi4DMXcfj4OItwDOnhl5oziPnT6g==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"abab": "^2.0.6",
"whatwg-mimetype": "^3.0.0",
@ -2164,6 +2268,9 @@
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz",
"integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"punycode": "^2.3.0"
},
@ -2175,6 +2282,9 @@
"version": "12.0.1",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-12.0.1.tgz",
"integrity": "sha512-Ed/LrqB8EPlGxjS+TrsXcpUond1mhccS3pchLhzSgPCnTimUCKj3IZE75pAs5m6heB2U2TMerKFUXheyHY+VDQ==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"tr46": "^4.1.1",
"webidl-conversions": "^7.0.0"
@ -2211,6 +2321,7 @@
"version": "4.3.4",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz",
"integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==",
"dev": true,
"dependencies": {
"ms": "2.1.2"
},
@ -2226,7 +2337,10 @@
"node_modules/decimal.js": {
"version": "10.4.3",
"resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.4.3.tgz",
"integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA=="
"integrity": "sha512-VBBaLc1MgL5XpzgIP7ny5Z6Nx3UrRkIViUkPUdtl9aya5amy3De1gsUUSB1g3+3sExYNjCAsAznmukyxCb1GRA==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/decompress-response": {
"version": "6.0.0",
@ -2281,6 +2395,9 @@
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
"dev": true,
"optional": true,
"peer": true,
"engines": {
"node": ">=0.4.0"
}
@ -2360,10 +2477,37 @@
"node": ">=6.0.0"
}
},
"node_modules/dom-serializer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"entities": "^4.2.0"
},
"funding": {
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
}
},
"node_modules/domelementtype": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
]
},
"node_modules/domexception": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/domexception/-/domexception-4.0.0.tgz",
"integrity": "sha512-A2is4PLG+eeSfoTMA95/s4pvAoSo2mKtiM5jlHkAVewmiO8ISFTFKZjH7UAM1Atli/OT/7JHOrJRJiMKUZKYBw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"webidl-conversions": "^7.0.0"
},
@ -2371,6 +2515,38 @@
"node": ">=12"
}
},
"node_modules/domhandler": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
"dependencies": {
"domelementtype": "^2.3.0"
},
"engines": {
"node": ">= 4"
},
"funding": {
"url": "https://github.com/fb55/domhandler?sponsor=1"
}
},
"node_modules/domino": {
"version": "2.1.6",
"resolved": "https://registry.npmjs.org/domino/-/domino-2.1.6.tgz",
"integrity": "sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ=="
},
"node_modules/domutils": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.1.0.tgz",
"integrity": "sha512-H78uMmQtI2AhgDJjWeQmHwJJ2bLPD3GMmO7Zja/ZZh84wkm+4ut+IUnUdRa8uCGX88DiVx1j6FRe1XfxEgjEZA==",
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3"
},
"funding": {
"url": "https://github.com/fb55/domutils?sponsor=1"
}
},
"node_modules/dotenv": {
"version": "16.0.3",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.0.3.tgz",
@ -2844,6 +3020,9 @@
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
"integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.8",
@ -2894,9 +3073,9 @@
"integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A=="
},
"node_modules/get-func-name": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.0.tgz",
"integrity": "sha512-Hm0ixYtaSZ/V7C8FJrtZIuBBI+iSgL+1Aq82zSu8VQNB4S3Gk8e7Qs3VwBDJAhmRZcFqkl3tQu36g/Foh5I5ig==",
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/get-func-name/-/get-func-name-2.0.2.tgz",
"integrity": "sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==",
"dev": true,
"engines": {
"node": "*"
@ -3077,6 +3256,9 @@
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-3.0.0.tgz",
"integrity": "sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"whatwg-encoding": "^2.0.0"
},
@ -3084,10 +3266,31 @@
"node": ">=12"
}
},
"node_modules/htmlparser2": {
"version": "8.0.2",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz",
"integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==",
"funding": [
"https://github.com/fb55/htmlparser2?sponsor=1",
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
],
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1",
"entities": "^4.4.0"
}
},
"node_modules/http-proxy-agent": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
"integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"@tootallnate/once": "2",
"agent-base": "6",
@ -3101,6 +3304,9 @@
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
"integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"agent-base": "6",
"debug": "4"
@ -3122,6 +3328,9 @@
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"safer-buffer": ">= 2.1.2 < 3.0.0"
},
@ -3298,7 +3507,10 @@
"node_modules/is-potential-custom-element-name": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
"integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="
"integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/is-reference": {
"version": "1.2.1",
@ -3378,6 +3590,9 @@
"version": "22.0.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-22.0.0.tgz",
"integrity": "sha512-p5ZTEb5h+O+iU02t0GfEjAnkdYPrQSkfuTSMkMYyIoMvUNEHsbG0bHHbfXIcfTqD2UfvjQX7mmgiFsyRwGscVw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"abab": "^2.0.6",
"cssstyle": "^3.0.0",
@ -3419,6 +3634,9 @@
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-4.1.1.tgz",
"integrity": "sha512-2lv/66T7e5yNyhAAC4NaKe5nVavzuGJQVVtRYLyQ2OI8tsJ61PMLlelehb0wi2Hx6+hT/OJUWZcw8MjlSRnxvw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"punycode": "^2.3.0"
},
@ -3430,6 +3648,9 @@
"version": "12.0.1",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-12.0.1.tgz",
"integrity": "sha512-Ed/LrqB8EPlGxjS+TrsXcpUond1mhccS3pchLhzSgPCnTimUCKj3IZE75pAs5m6heB2U2TMerKFUXheyHY+VDQ==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"tr46": "^4.1.1",
"webidl-conversions": "^7.0.0"
@ -3716,6 +3937,9 @@
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
"dev": true,
"optional": true,
"peer": true,
"engines": {
"node": ">= 0.6"
}
@ -3724,6 +3948,9 @@
"version": "2.1.35",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"mime-db": "1.52.0"
},
@ -3878,7 +4105,8 @@
"node_modules/ms": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
"integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w=="
"integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==",
"dev": true
},
"node_modules/mz": {
"version": "2.7.0",
@ -3989,10 +4217,24 @@
"node": ">=8"
}
},
"node_modules/nth-check": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
"dependencies": {
"boolbase": "^1.0.0"
},
"funding": {
"url": "https://github.com/fb55/nth-check?sponsor=1"
}
},
"node_modules/nwsapi": {
"version": "2.2.4",
"resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.4.tgz",
"integrity": "sha512-NHj4rzRo0tQdijE9ZqAx6kYDcoRwYwSYzCA8MY3JzfxlrvEU0jhnhJT9BhqhJs7I/dKcrDm6TyulaRqZPIhN5g=="
"integrity": "sha512-NHj4rzRo0tQdijE9ZqAx6kYDcoRwYwSYzCA8MY3JzfxlrvEU0jhnhJT9BhqhJs7I/dKcrDm6TyulaRqZPIhN5g==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/object-assign": {
"version": "4.1.1",
@ -4209,6 +4451,18 @@
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-htmlparser2-tree-adapter": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz",
"integrity": "sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==",
"dependencies": {
"domhandler": "^5.0.2",
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/path-exists": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz",
@ -4588,7 +4842,6 @@
"version": "2.8.4",
"resolved": "https://registry.npmjs.org/prettier/-/prettier-2.8.4.tgz",
"integrity": "sha512-vIS4Rlc2FNh0BySk3Wkd6xmwxB0FpOndW5fisM5H8hsZSxU2VWVB5CWIkIjWvrHjIhxk2g3bfMKM87zNTrZddw==",
"dev": true,
"bin": {
"prettier": "bin-prettier.js"
},
@ -4737,7 +4990,10 @@
"node_modules/psl": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz",
"integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag=="
"integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/pump": {
"version": "3.0.0",
@ -4768,7 +5024,10 @@
"node_modules/querystringify": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz",
"integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ=="
"integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/queue-microtask": {
"version": "1.2.3",
@ -4868,7 +5127,10 @@
"node_modules/requires-port": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
"integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="
"integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/resolve": {
"version": "1.22.1",
@ -4938,7 +5200,10 @@
"node_modules/rrweb-cssom": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.6.0.tgz",
"integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw=="
"integrity": "sha512-APM0Gt1KoXBz0iIkkdB/kfvGOwC4UuJFeG/c+yV7wSc7q96cG/kJ0HiYCnzivD9SB53cLV1MlHFNfOuPaadYSw==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/run-parallel": {
"version": "1.2.0",
@ -4996,7 +5261,10 @@
"node_modules/safer-buffer": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/sander": {
"version": "0.5.1",
@ -5026,6 +5294,9 @@
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz",
"integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"xmlchars": "^2.2.0"
},
@ -5611,7 +5882,10 @@
"node_modules/symbol-tree": {
"version": "3.2.4",
"resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="
"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/tailwind-scrollbar": {
"version": "3.0.0",
@ -5790,6 +6064,9 @@
"version": "4.1.3",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz",
"integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"psl": "^1.1.33",
"punycode": "^2.1.1",
@ -5894,6 +6171,19 @@
"node": "*"
}
},
"node_modules/turndown": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/turndown/-/turndown-7.1.2.tgz",
"integrity": "sha512-ntI9R7fcUKjqBP6QU8rBK2Ehyt8LAzt3UBT9JR9tgo6GtuKvyUzpayWmeMKJw1DPdXzktvtIT8m2mVXz+bL/Qg==",
"dependencies": {
"domino": "^2.1.6"
}
},
"node_modules/turndown-plugin-gfm": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/turndown-plugin-gfm/-/turndown-plugin-gfm-1.0.2.tgz",
"integrity": "sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg=="
},
"node_modules/type-check": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz",
@ -5973,6 +6263,9 @@
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
"integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==",
"dev": true,
"optional": true,
"peer": true,
"engines": {
"node": ">= 4.0.0"
}
@ -6065,6 +6358,9 @@
"version": "1.5.10",
"resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz",
"integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"querystringify": "^2.1.1",
"requires-port": "^1.0.0"
@ -6253,6 +6549,9 @@
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-4.0.0.tgz",
"integrity": "sha512-d+BFHzbiCx6zGfz0HyQ6Rg69w9k19nviJspaj4yNscGjrHu94sVP+aRm75yEbCh+r2/yR+7q6hux9LVtbuTGBw==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"xml-name-validator": "^4.0.0"
},
@ -6296,6 +6595,9 @@
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz",
"integrity": "sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==",
"dev": true,
"optional": true,
"peer": true,
"dependencies": {
"iconv-lite": "0.6.3"
},
@ -6307,6 +6609,9 @@
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-3.0.0.tgz",
"integrity": "sha512-nt+N2dzIutVRxARx1nghPKGv1xHikU7HKdfafKkLNLindmPU/ch3U31NOCGGA/dmPcmb1VlofO0vnKAcsm0o/Q==",
"dev": true,
"optional": true,
"peer": true,
"engines": {
"node": ">=12"
}
@ -6397,6 +6702,9 @@
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-4.0.0.tgz",
"integrity": "sha512-ICP2e+jsHvAj2E2lIHxa5tjXRlKDJo4IdvPvCXbXQGdzSfmSpNVyIKMvoZHjDY9DP0zV17iI85o90vRFXNccRw==",
"dev": true,
"optional": true,
"peer": true,
"engines": {
"node": ">=12"
}
@ -6404,7 +6712,10 @@
"node_modules/xmlchars": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==",
"dev": true,
"optional": true,
"peer": true
},
"node_modules/yallist": {
"version": "4.0.0",

View File

@ -24,13 +24,14 @@
"@types/jsdom": "^21.1.1",
"@types/marked": "^4.0.8",
"@types/parquetjs": "^0.10.3",
"@types/prettier": "^2.7.3",
"@types/turndown": "^5.0.2",
"@typescript-eslint/eslint-plugin": "^6.x",
"@typescript-eslint/parser": "^6.x",
"eslint": "^8.28.0",
"eslint-config-prettier": "^8.5.0",
"eslint-plugin-svelte": "^2.30.0",
"marked-katex-extension": "^3.0.6",
"prettier": "^2.8.0",
"prettier-plugin-svelte": "^2.10.1",
"prettier-plugin-tailwindcss": "^0.2.7",
"svelte": "^4.0.0",
@ -49,20 +50,23 @@
"@xenova/transformers": "^2.6.0",
"autoprefixer": "^10.4.14",
"aws4fetch": "^1.0.17",
"cheerio": "^1.0.0-rc.12",
"date-fns": "^2.29.3",
"dotenv": "^16.0.3",
"handlebars": "^4.7.8",
"highlight.js": "^11.7.0",
"jsdom": "^22.0.0",
"marked": "^4.3.0",
"mongodb": "^5.8.0",
"nanoid": "^4.0.2",
"openid-client": "^5.4.2",
"parquetjs": "^0.11.2",
"postcss": "^8.4.31",
"prettier": "^2.8.0",
"serpapi": "^1.1.1",
"tailwind-scrollbar": "^3.0.0",
"tailwindcss": "^3.3.1",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2",
"zod": "^3.22.3"
}
}

View File

@ -43,7 +43,7 @@ export async function buildPrompt({
${webSearch.context}
=====================
${previousQuestions}
Answer the question: ${lastMsg.content}
Answer the question: ${lastMsg.content} in a concise manner. When asked about past or future events, remember that today is ${currentDate}.
`,
},
];

View File

@ -42,7 +42,7 @@ Current Question: Where is it being hosted ?`,
from: "assistant",
content: "Epson F2270 DTG printer printhead",
},
{ from: "user", content: "What were the news yesterday ?" },
{ from: "user", content: "What happened yesterday?" },
{
from: "assistant",
content: `news ${format(new Date(Date.now() - 864e5), "MMMM d, yyyy")}`,

View File

@ -0,0 +1,3 @@
/**
 * Ambient typings for the "turndown-plugin-gfm" package.
 *
 * Only the `tables` plugin export is declared here.
 * NOTE(review): presumably the package ships no type declarations of its own — confirm.
 */
declare module "turndown-plugin-gfm" {
	// Bring the `TurndownService` namespace into scope for the annotation below.
	// NOTE(review): assumes the turndown typings expose no global `TurndownService`
	// namespace, in which case the bare reference would not resolve — confirm.
	import type TurndownService from "turndown";

	/** Turndown plugin, to be passed to `TurndownService#use`. */
	export const tables: TurndownService.Plugin;
}

View File

@ -1,32 +1,48 @@
import { JSDOM, VirtualConsole } from "jsdom";
import { load as cheerioLoad } from "cheerio";
import TurndownService from "turndown";
import { tables } from "turndown-plugin-gfm";
import prettier from "prettier";
export async function parseWeb(url: string) {
const turndownService = new TurndownService();
turndownService.use(tables);
export async function parseWebintoMarkdown(url: string) {
const abortController = new AbortController();
setTimeout(() => abortController.abort(), 10000);
const htmlString = await fetch(url, { signal: abortController.signal })
.then((response) => response.text())
.catch();
const virtualConsole = new VirtualConsole();
virtualConsole.on("error", () => {
// No-op to skip console errors.
const $ = cheerioLoad(htmlString);
// Remove all CSS, including inline ones
$("style").remove();
$("*").removeAttr("style");
// Remove all <script> elements within <body>
$("body script").remove();
// Replace links with just their text
$("a").replaceWith((_idx, el) => {
return $(el).text();
});
// put the html string into a DOM
const dom = new JSDOM(htmlString ?? "", {
virtualConsole,
// Replace images with just their alt text
$("img").replaceWith((_idx, el) => {
return $(el).attr("alt") || "Image";
});
const { document } = dom.window;
const textElTags = "p";
const paragraphs = document.querySelectorAll(textElTags);
if (!paragraphs.length) {
throw new Error(`webpage doesn't have any "${textElTags}" element`);
const htmlBody = $("body").html();
if (!htmlBody) {
throw new Error(`Couldn't parse html body for ${url}`);
}
const paragraphTexts = Array.from(paragraphs).map((p) => p.textContent);
// combine text contents from paragraphs and then remove newlines and multiple spaces
const text = paragraphTexts.join(" ").replace(/ {2}|\r\n|\n|\r/gm, "");
return text;
const markdownRaw = turndownService.turndown(htmlBody);
const potentialMarkdown = prettier.format(markdownRaw, { parser: "markdown" });
const markdown = getMarkdownSection(potentialMarkdown);
if (!markdown) {
throw new Error(`Couldn't parse markdown for ${url}`);
}
return markdown;
}
/**
 * Extracts the part of a markdown document that begins at its first heading.
 *
 * A heading is any line that starts with one or more `#` characters followed by
 * a whitespace character. Everything before that line is dropped and the
 * remainder is trimmed.
 *
 * @param markdown - The markdown text to scan.
 * @returns The trimmed text from the first heading onward, or an empty string
 * when no heading line is present.
 */
function getMarkdownSection(markdown: string) {
	const firstHeading = /^#+\s/m.exec(markdown);
	if (firstHeading === null) {
		// No heading anywhere in the document.
		return "";
	}
	return markdown.slice(firstHeading.index).trim();
}

View File

@ -1,19 +1,20 @@
import { searchWeb } from "$lib/server/websearch/searchWeb";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import type { TextWithSource, WebSearch } from "$lib/types/WebSearch";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import { parseWebintoMarkdown } from "$lib/server/websearch/parseWeb";
import {
MAX_SEQ_LEN as CHUNK_CAR_LEN,
SIMILARITY_SCORE_THRESHOLD,
findSimilarSentences,
} from "$lib/server/websearch/sentenceSimilarity";
import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import { parseMarkdown, chunkSlidingWindow } from "./slidingWindowChunker";
import { getWebSearchProvider } from "./searchWeb";
const MAX_N_PAGES_SCRAPE = 10 as const;
const MAX_N_PAGES_SCRAPE = 20 as const;
const MAX_N_PAGES_EMBED = 5 as const;
const MAX_N_CHUNKS_PER_SOURCE = 40 as const;
export async function runWebSearch(
conv: Conversation,
@ -53,57 +54,77 @@ export async function runWebSearch(
[];
webSearch.results = webSearch.results
.filter(({ link }) => !link.includes("youtube.com")) // filter out youtube links
.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
.slice(0, MAX_N_PAGES_SCRAPE);
let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
if (webSearch.results.length > 0) {
if (!webSearch.results.length) {
throw new Error("No results found for this search query");
}
let paragraphChunks: TextWithSource[] = [];
while (webSearch.results.length) {
const webSources = webSearch.results.slice(0, MAX_N_PAGES_EMBED);
webSearch.results = webSearch.results.slice(MAX_N_PAGES_EMBED);
appendUpdate("Browsing results");
const promises = webSearch.results.map(async (result) => {
const { link } = result;
let text = result.text ?? "";
if (!text) {
const promises = webSources.map(async (source) => {
const { link } = source;
let markdown = source.text ?? "";
if (!markdown) {
try {
text = await parseWeb(link);
markdown = await parseWebintoMarkdown(link);
appendUpdate("Browsing webpage", [link]);
} catch (e) {
// ignore errors
}
}
const MAX_N_CHUNKS = 100;
const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
return texts.map((t) => ({ source: result, text: t }));
return { text: markdown, source } as TextWithSource;
});
const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
paragraphChunks = nestedParagraphChunks.flat();
if (!paragraphChunks.length) {
throw new Error("No text found on the first 5 results");
// chunk and do all the things here
const markdownsWithSource = await Promise.all(promises);
// increase the character limit there
const markdownFlatNodes = markdownsWithSource.map((item) => parseMarkdown(item)).flat();
paragraphChunks = chunkSlidingWindow(markdownFlatNodes, {
windowWidth: 1024,
paddingWidth: 100,
}).slice(0, MAX_N_CHUNKS_PER_SOURCE);
appendUpdate("Extracting relevant information");
const topKClosestParagraphs = 5;
const texts = paragraphChunks.map(({ text }) => text);
const similarityResults = await findSimilarSentences(prompt, texts, {
topK: topKClosestParagraphs,
});
// if no text chunk was similar enough to the query, embed the next set of web pages
if (similarityResults[0].score > SIMILARITY_SCORE_THRESHOLD) {
continue;
}
} else {
throw new Error("No results found for this search query");
const indices = similarityResults.map((item) => item.index);
webSearch.context = indices.map((idx) => texts[idx]).join(" ");
const usedSources = new Set<string>();
for (const idx of indices) {
const { source } = paragraphChunks[idx];
if (!usedSources.has(source.link)) {
usedSources.add(source.link);
webSearch.contextSources.push(source);
}
}
updatePad({
type: "webSearch",
messageType: "sources",
message: "sources",
sources: webSearch.contextSources,
});
// break from the loop of creating embeddings for web pages
break;
}
appendUpdate("Extracting relevant information");
const topKClosestParagraphs = 8;
const texts = paragraphChunks.map(({ text }) => text);
const indices = await findSimilarSentences(prompt, texts, {
topK: topKClosestParagraphs,
});
webSearch.context = indices.map((idx) => texts[idx]).join("");
const usedSources = new Set<string>();
for (const idx of indices) {
const { source } = paragraphChunks[idx];
if (!usedSources.has(source.link)) {
usedSources.add(source.link);
webSearch.contextSources.push(source);
}
if (!webSearch.context) {
throw new Error("Web search couldn't find relevant information.");
}
updatePad({
type: "webSearch",
messageType: "sources",
message: "sources",
sources: webSearch.contextSources,
});
} catch (searchError) {
if (searchError instanceof Error) {
appendUpdate(

View File

@ -1,5 +1,8 @@
import type { Tensor } from "@xenova/transformers";
import { pipeline, dot } from "@xenova/transformers";
import { pipeline, dot, env } from "@xenova/transformers";
// using explicit cache dir for tf.js models
env.cacheDir = "./local-models/revision1";
// see here: https://github.com/nmslib/hnswlib/blob/359b2ba87358224963986f709e593d799064ace6/README.md?plain=1#L34
function innerProduct(tensor1: Tensor, tensor2: Tensor) {
@ -8,8 +11,8 @@ function innerProduct(tensor1: Tensor, tensor2: Tensor) {
const modelId = "Xenova/gte-small";
const extractor = await pipeline("feature-extraction", modelId);
// see https://huggingface.co/thenlper/gte-small/blob/d8e2604cadbeeda029847d19759d219e0ce2e6d8/README.md?code=true#L2625
export const MAX_SEQ_LEN = 512 as const;
export const SIMILARITY_SCORE_THRESHOLD = 0.2;
export async function findSimilarSentences(
query: string,
@ -22,19 +25,19 @@ export async function findSimilarSentences(
const queryTensor: Tensor = output[0];
const sentencesTensor: Tensor = output.slice([1, input.length - 1]);
const distancesFromQuery: { distance: number; index: number }[] = [...sentencesTensor].map(
const distancesFromQuery: { score: number; index: number }[] = [...sentencesTensor].map(
(sentenceTensor: Tensor, index: number) => {
return {
distance: innerProduct(queryTensor, sentenceTensor),
score: innerProduct(queryTensor, sentenceTensor),
index: index,
};
}
);
distancesFromQuery.sort((a, b) => {
return a.distance - b.distance;
return a.score - b.score;
});
// Return the indexes of the closest topK sentences
return distancesFromQuery.slice(0, topK).map((item) => item.index);
return distancesFromQuery.slice(0, topK);
}

View File

@ -0,0 +1,76 @@
import type { MarkdownNode, MarkdownFlatNode, TextWithSource } from "../../types/WebSearch";
/**
 * Parses a markdown string into a nested heading tree and returns that tree flattened.
 * For example, `## My Heading 2` becomes a child of a preceding `# My Heading 1`.
 *
 * Any text that appears before the first heading is discarded.
 *
 * @param markdownWithSource - Markdown text together with the source it came from;
 * the source is attached to every node produced.
 * @returns The flattened nodes, as produced by `flattenNodes`.
 */
export function parseMarkdown(markdownWithSource: TextWithSource) {
	const REGEX_MD_HEADING = /^((#{1,6}) .+)\n+/gm;
	const { text: markdown, source } = markdownWithSource;
	// Splitting on a regex with two capture groups yields
	// [preamble, heading, hashes, content, heading, hashes, content, ...].
	const parts = markdown.split(REGEX_MD_HEADING);
	let tree: MarkdownNode[] = [];
	let cursor = 1; // index 0 is the preamble before the first heading
	while (cursor < parts.length) {
		const [heading, hashes, content] = parts.slice(cursor, cursor + 3);
		// The number of `#` characters is the heading depth.
		tree = addToTree(tree, { heading, depth: hashes.length, content, source, sections: [] });
		cursor += 3;
	}
	return flattenNodes(tree);
}
/**
 * Helper for `parseMarkdown`: inserts `node` into the heading tree `nodes`.
 *
 * The node is appended at the current level when that level is empty or its last
 * entry is at the same or a deeper heading depth; otherwise it is inserted
 * recursively into the last entry's subsections.
 *
 * @param nodes - Sibling nodes at the current level; mutated in place.
 * @param node - Node to insert.
 * @returns The same `nodes` array that was passed in.
 */
function addToTree(nodes: MarkdownNode[], node: MarkdownNode): MarkdownNode[] {
	const last = nodes[nodes.length - 1];
	if (nodes.length === 0 || last.depth >= node.depth) {
		nodes.push(node);
		return nodes;
	}
	// `node` is deeper than the last sibling, so it belongs somewhere beneath it.
	last.sections = addToTree(last.sections || [], node);
	return nodes;
}
/**
 * Flattens a heading tree into a single list by running `flattenNode` on each
 * top-level node in order, starting each one with an empty heading path.
 *
 * @param nodes - Top-level nodes of the tree.
 * @returns The flat nodes collected from every top-level node.
 */
function flattenNodes(nodes: MarkdownNode[]): MarkdownFlatNode[] {
	const collected: MarkdownFlatNode[] = [];
	nodes.forEach((root) => flattenNode(root, [], collected));
	return collected;
}
/**
 * Helper for `flattenNodes`: appends `node` and then all of its descendants
 * (depth-first, parents before children) to `flatNodes`.
 *
 * Each flat node's heading is the chain of ancestor headings followed by the
 * node's own heading, joined with blank lines.
 *
 * @param node - Tree node to flatten.
 * @param suffix - Headings of the node's ancestors, outermost first; not mutated.
 * @param flatNodes - Output array that receives the flat nodes.
 */
function flattenNode(node: MarkdownNode, suffix: string[], flatNodes: MarkdownFlatNode[]) {
	const headingPath = suffix.concat(node.heading);
	flatNodes.push({
		heading: headingPath.join("\n\n"),
		content: node.content,
		source: node.source,
	});
	node.sections.forEach((child) => flattenNode(child, headingPath, flatNodes));
}
/** A length measured in characters, as counted by `String.prototype.length`. */
type Character = number;

/**
 * Splits the content of every flat markdown node into overlapping chunks, each
 * prefixed with the node's heading followed by a blank line.
 *
 * Per node, the window advances by
 * `windowWidth - 2 * paddingWidth - (heading.length + 2)` characters of new
 * content, and each chunk additionally includes up to `paddingWidth` characters of
 * context on either side, so a full chunk (heading included) is `windowWidth` long.
 *
 * Every node yields at least one chunk, even when its content is empty.
 *
 * @param flatNodes - Flat markdown nodes to chunk.
 * @param opts - `windowWidth`: target size of a whole chunk, heading included;
 * `paddingWidth`: overlap taken from the neighbouring windows on each side.
 * @returns The chunks of all nodes, in node order, each carrying its node's source.
 */
export function chunkSlidingWindow(
	flatNodes: MarkdownFlatNode[],
	opts: { windowWidth: Character; paddingWidth: Character }
) {
	const { windowWidth, paddingWidth } = opts;
	const results: TextWithSource[] = [];
	for (const { heading, content, source } of flatNodes) {
		const prefix = heading + "\n\n";
		// A heading path that fills (or overflows) the window would make the step
		// zero or negative and the loop would never terminate; always advance by
		// at least one character.
		const width = Math.max(1, windowWidth - 2 * paddingWidth - prefix.length);
		let i = 0;
		// do/while keeps the single heading-only chunk for empty content while
		// stopping as soon as the window start passes the last character, so no
		// trailing chunk made purely of overlap is emitted.
		do {
			const startIdx = Math.max(0, i - paddingWidth);
			const endIdx = i + width + paddingWidth;
			const chunk: TextWithSource = { text: prefix + content.slice(startIdx, endIdx), source };
			results.push(chunk);
			i += width;
		} while (i < content.length);
	}
	return results;
}

View File

@ -26,6 +26,21 @@ export type WebSearchMessageSources = {
sources: WebSearchSource[];
};
/**
 * One heading-delimited section of a markdown document, with its nested
 * subsections. Built by `parseMarkdown` in the sliding window chunker.
 */
export interface MarkdownNode {
/** The heading line of the section, including its leading `#` markers. */
heading: string;
/** Heading level, i.e. the number of leading `#` characters (1-6). */
depth: number;
/** Text that follows the heading, up to the next heading. */
content: string;
/** Web search source the markdown was taken from. */
source: WebSearchSource;
/** Sections with a deeper heading level nested under this one. */
sections: MarkdownNode[];
}
/**
 * A `MarkdownNode` without its tree structure. When produced by the sliding window
 * chunker's flattening step, `heading` holds the headings of all ancestors plus the
 * node's own heading, joined with blank lines.
 */
export type MarkdownFlatNode = Pick<MarkdownNode, "heading" | "content" | "source">;
/** A piece of text (a whole page's markdown or a single chunk) paired with its source. */
export interface TextWithSource {
/** The text itself. */
text: string;
/** The `WebSearchSource` the text is attributed to. */
source: WebSearchSource;
}
export interface YouWebSearch {
hits: YouSearchHit[];
latency: number;