
Gitee 极速下载/spark-nlp

This repository is a mirror maintained to improve download speeds in mainland China; it is synced from the upstream repository once per day. Upstream: https://github.com/JohnSnowLabs/spark-nlp
search_index.rb 18.22 KB
require 'set'
require 'uri'
require 'net/http'
require 'json'
require 'date'
require 'elasticsearch'
require 'nokogiri'
require 'aws-sdk-s3'
BUCKET_NAME = "pypi.johnsnowlabs.com"
SEARCH_URL = (ENV["SEARCH_ORIGIN"] || 'https://search.modelshub.johnsnowlabs.com') + '/'
ELASTICSEARCH_INDEX_NAME = ENV["ELASTICSEARCH_INDEX_NAME"] || 'models'
ORIGIN = ENV["ORIGIN"] || ""
OUTDATED_EDITIONS = ['Spark NLP 2.0', 'Spark NLP 2.1', 'Healthcare NLP 2.0']
SPARK_NLP_ORIGIN = "https://sparknlp.org"
JOHNSNOWLABS_ORIGIN = "https://nlp.johnsnowlabs.com"
$remote_editions = Set.new
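# Upload the generated models JSON file to the public S3 bucket under public/models.json.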
def upload_file_to_s3_bucket(file_path)
  s3 = Aws::S3::Client.new(region: 'eu-west-1')
  object_key = "public/models.json"
  begin
    s3.put_object(bucket: BUCKET_NAME, key: object_key, body: File.open(file_path, 'rb'), acl: 'public-read')
    puts "File uploaded successfully to #{BUCKET_NAME}/#{object_key}"
  rescue Aws::S3::Errors::ServiceError => e
    puts "Failed to upload file: #{e.message}"
  end
end
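# Comparable version parsed from the trailing "major.minor" of an edition name
# (e.g. "Spark NLP 5.3" -> [5, 3]); ordering comes from Array#<=>.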
class Version < Array
  def initialize(name)
    m = /(\d+\.\d+)\z/.match(name)
    super(m ? m[1].split('.').map(&:to_i) : [])
  end

  def <(x)
    (self <=> x) < 0
  end

  def <=(x)
    (self <=> x) <= 0
  end
end
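# Rewrite legacy product names in an edition string to the current branding
# (e.g. "Spark OCR 4.0" -> "Visual NLP 4.0").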
def edition_name(old_product)
  old_to_new_product_name = {
    'Spark NLP for Legal' => 'Legal NLP',
    'Spark NLP for Finance' => 'Finance NLP',
    'Spark OCR' => 'Visual NLP',
    'Spark NLP for Healthcare' => 'Healthcare NLP',
  }
  old_to_new_product_name.each do |key, value|
    return old_product.sub(key, value) if old_product.include? key
  end
  old_product
end
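# Compute the editions a model is compatible with: every known edition of the
# same product from the model's own edition up to (but excluding) the next
# edition the model was re-published for.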
def compatible_editions(editions, model_editions, edition)
  return edition if edition.to_s.empty?
  return edition if OUTDATED_EDITIONS.include? edition

  def to_product_name(edition)
    m = /^(.*?) \d+\.\d+$/.match(edition)
    return m ? m[1] : nil
  end

  product_name = to_product_name(edition)
  if product_name
    selection_lambda = lambda { |v| to_product_name(v) == product_name && !OUTDATED_EDITIONS.include?(v) }
  else
    selection_lambda = lambda { |_| false }
  end
  editions = editions.select(&selection_lambda)
  model_editions = model_editions.select(&selection_lambda)

  curr_index = model_editions.index(edition)
  return edition if curr_index.nil?

  min = edition
  max = model_editions[curr_index + 1].nil? ? nil : model_editions[curr_index + 1]
  editions.select do |v|
    if min && max
      (Version.new(min) <= Version.new(v)) && (Version.new(v) < Version.new(max))
    elsif max
      Version.new(v) < Version.new(max)
    else
      Version.new(min) <= Version.new(v)
    end
  end
end
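# True when the given edition is not yet known to the remote search service.
# Remote editions are fetched once and memoized in $remote_editions.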
def editions_changed?(edition)
  if $remote_editions.empty?
    puts "Retrieving remote editions...."
    uri = URI(SEARCH_URL)
    res = Net::HTTP.get_response(uri)
    if res.is_a?(Net::HTTPSuccess)
      data = JSON.parse(res.body)
      editions = data['meta']['aggregations']['editions']
      $remote_editions = editions.to_set
    end
  end
  local_editions = Set.new
  local_editions << edition
  return !(local_editions - OUTDATED_EDITIONS).to_set.subset?($remote_editions)
end
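# Strip the trailing version from a short edition name ("Healthcare NLP 3.0" -> "Healthcare NLP").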
def to_product_name(edition_short)
  m = /^(.*?)\s\d+\.\d+$/.match(edition_short)
  m ? m[1] : nil
end
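# Extracts structured metadata (download link, document type, origin, predicted
# entities, benchmarking table, reference URLs) from a model post's Markdown.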
class Extractor
  def initialize(content)
    @content = content
  end

  def download_link
    m = /\[Download\]\((.*?)\)/.match(@content)
    m ? m[1] : nil
  end

  def doc_type
    m = /\|Type:\|(.*?)\|/.match(@content)
    if m
      return m[1] == 'pipeline' ? 'pipeline' : 'model'
    end
    nil
  end

  def print_error(message)
    if ENV["DEBUG"]
      print(message + "\n")
    end
  end

  def origin
    m = /\|\s*License:\s*\|\s*Licensed\s*\|/m.match(@content)
    m ? JOHNSNOWLABS_ORIGIN : SPARK_NLP_ORIGIN
  end
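  # Parse the "## Predicted Entities" section. Strategies are popped from the
  # end of the list, so the comma-separated format is tried first, then the
  # newline-separated list format.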
  def predicted_entities
    m = /## Predicted Entities(.*?)(##|{:\.btn-box\})/m.match(@content)
    if m
      buf = m[1].strip
      strategies = ['newline_separated', 'comma_separated']
      while strategies.length > 0
        case strategies.pop
        when 'comma_separated'
          items = comma_separated_predicted_entities(buf)
        when 'newline_separated'
          items = newline_separated_predicted_entities(buf)
        end
        return items if items
      end
    end
    nil
  end
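  # Parse the whitespace-aligned table inside the "## Benchmarking" bash code
  # block into an array of row hashes keyed by the header row (the 'label'
  # header is renamed to 'name'); returns nil if the table looks malformed.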
  def benchmarking_results(post_url)
    if @content.include? '## Benchmarking'
      m = /## Benchmarking(\r\n|\r|\n)+```bash(.*?)```/m.match(@content)
      if m
        buf = m[2].strip
        lines = buf.split("\n")
        col_index_to_header_mapping = {}
        return_data = []
        lines.each_with_index do |line, index|
          if index == 0
            # This is a header row
            headers = line.split
            if headers.include?('|')
              print_error("Failed to parse the Benchmarking section (invalid syntax) #{post_url}")
              return nil
            end
            unless headers.include?('label')
              print_error("Failed to parse the Benchmarking section (the label header is missing) #{post_url}")
              return nil
            end
            headers.each_with_index do |header, i|
              if header == 'label'
                header = 'name'
              end
              col_index_to_header_mapping[i] = header
            end
          else
            row_data = {}
            values = line.split
            if values.length != col_index_to_header_mapping.keys.count
              print_error("Failed to parse the Benchmarking section (different column and cell count) #{post_url}")
              return nil
            end
            values.each_with_index do |value, j|
              if value.include?("prec:") or value.include?("rec:") or value.include?("f1:")
                print_error("Failed to parse the Benchmarking section (cells contain column names) #{post_url}")
                return nil
              end
              row_data[col_index_to_header_mapping[j]] = value
            end
            unless row_data.empty?
              return_data << row_data
            end
          end
        end
        return return_data
      else
        print_error("Failed to parse the Benchmarking section (invalid section) #{post_url}")
      end
    end
    nil
  end
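  # Collect the unique URLs referenced in the "## References" section.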
  def references_results(post_url)
    if @content.include? '## References'
      m = /^## References[^#]+([\W\w]*?)($)/m.match(@content)
      if m
        references_section = m[0]
        url_scans = references_section.scan(URI.regexp)
        return url_scans.map do |url_parts|
          unless url_parts.one?
            url_parts.each_with_index.map do |part, index|
              if index == 0
                part = part + "://"
              end
              part
            end.join.delete_suffix('.').delete_suffix(')')
          end
        end.compact.uniq
      end
    end
    nil
  end
  private

  def comma_separated_predicted_entities(buf)
    has_comma = buf.include?(',')
    has_backtick = buf.include?('`')
    buf = buf.gsub("\n", ' ')
    return nil unless has_comma || (has_backtick && buf.match?(/\A\S+\z/))
    buf.scan(/`[^`]+`/).map { |v| v.gsub(/["`]/, '') }.select { |v| !v.empty? && !v.include?(',') && !v.include?('|') }
  end

  def newline_separated_predicted_entities(buf)
    return nil unless buf.include? "\n"
    return nil unless buf.strip.start_with? '-'
    buf.split("\n").collect { |v| v.gsub(/^-\s?/, '').gsub('`', '').strip }.select { |v| !v.empty? }
  end
end
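# Buffers Elasticsearch upsert operations and flushes them in bulk batches of 500.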
class BulkIndexer
  def initialize(client)
    @client = client
    @buffer = []
  end

  def index(id, data)
    @buffer << { update: { _id: id, data: { doc: data, doc_as_upsert: true } } }
    execute if @buffer.length >= 500
  end

  def execute
    return nil unless @client
    return nil if @buffer.empty?
    puts "Indexing #{@buffer.length} models..."
    @client.bulk(index: ELASTICSEARCH_INDEX_NAME, body: @buffer)
    @buffer.clear
  end
end
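# Accumulators shared across the Jekyll hooks below.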
editions = Set.new
# Seed editions that are not yet present in any existing model
editions << 'Spark NLP 5.3'

uniq_to_models_mapping = {}
name_language_editions_sparkversion_to_models_mapping = {}
models_json = {}
models_benchmarking_json = {}
models_references_json = {}
all_posts_id = []
all_deleted_posts = []

def is_latest?(group, model)
  models = group[model[:uniq_key]]
  Date.parse(model[:date]) == models.map { |m| Date.parse(m[:date]) }.max
end
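# Before each post renders: extract metadata from its raw Markdown and record
# it for the models.json, benchmarking.json, and references.json exports.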
Jekyll::Hooks.register :posts, :pre_render do |post|
  extractor = Extractor.new(post.content)

  doc_type = extractor.doc_type
  if doc_type.nil?
    doc_type = post.data['tags'].include?('pipeline') ? 'pipeline' : 'model'
  end

  post.data['edition'] = edition_name(post.data['edition'])
  post.data["origin"] = extractor.origin

  models_json[post.url] = {
    title: post.data['title'],
    date: post.data['date'].strftime('%B %d, %Y'),
    name: post.data['name'],
    class: post.data['class'] || "",
    language: post.data['language'],
    task: post.data['task'],
    edition: post.data['edition'],
    categories: post.data['categories'],
    url: post.url,
    tags: post.data['tags'],
    download_link: extractor.download_link,
    predicted_entities: extractor.predicted_entities || [],
    type: doc_type,
    annotator: post.data['annotator'] || ""
  }

  benchmarking_info = extractor.benchmarking_results(post.url)
  if benchmarking_info
    models_benchmarking_json[post.url] = benchmarking_info
  end

  references = extractor.references_results(post.url)
  if references
    models_references_json[post.url] = references
  end
end
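# After each post renders: strip the HTML down to searchable body text, build
# the search document, and group models for compatibility resolution later.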
Jekyll::Hooks.register :posts, :post_render do |post|
  m = !post.data['edition'].nil? && /(.*)\.(\d+)\z/.match(post.data['edition'])
  edition_short = m.is_a?(MatchData) ? m[1] : ''

  body = Nokogiri::HTML(post.content)
  body.search('h2, pre, table, .btn-box, .tabs-box, .highlight').remove
  body = body.text.gsub(/\s+/, ' ').strip

  language = post.data['language']
  languages = [language]
  if language == 'xx'
    languages = post.data['tags'].select do |v|
      v.length <= 3 and /^[a-z]+$/.match?(v) and v != 'ner' and v != 'use'
    end
  end

  supported = !!post.data['supported']
  deprecated = !!post.data['deprecated']
  recommended = !!post.data['recommended']
  key = "#{post.data['name']}_#{post.data['language']}_#{post.data['edition']}_#{post.data["spark_version"]}"
  model = {
    id: post.url,
    name: post.data['name'],
    title: post.data['title'],
    tags_glued: post.data['tags'].join(' '),
    tags: post.data['tags'],
    task: post.data['task'],
    language: language,
    languages: languages,
    edition: post.data['edition'],
    edition_short: edition_short,
    date: post.data['date'].strftime('%F'),
    supported: supported && !deprecated,
    deprecated: deprecated,
    body: body,
    url: post.url,
    recommended: recommended,
    annotator: post.data['annotator'],
    uniq_key: key,
    origin: post.data["origin"],
  }

  uniq = "#{post.data['name']}_#{post.data['language']}"
  uniq_to_models_mapping[uniq] = [] unless uniq_to_models_mapping.has_key? uniq
  uniq_to_models_mapping[uniq] << model

  unless edition_short.empty?
    editions.add(edition_short)
    if not ENV['FULL_BUILD'] and editions_changed?(edition_short)
      print("Please retry with a full build. New edition #{edition_short} encountered.")
      # Write to $GITHUB_OUTPUT for CI/CD
      if ENV["GITHUB_OUTPUT"]
        open(ENV["GITHUB_OUTPUT"], 'a') do |f|
          f << "require_full_build=true"
        end
      end
      # Raise SystemExit with exit code 11 so CI can trigger a full rebuild
      exit(11)
    end
  end

  name_language_editions_sparkversion_to_models_mapping[key] = [] unless name_language_editions_sparkversion_to_models_mapping.has_key? key
  name_language_editions_sparkversion_to_models_mapping[key] << model
  all_posts_id << model[:id]
end
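# Connect to Elasticsearch when ELASTICSEARCH_URL is set, and create the index
# with explicit field mappings if it does not exist yet.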
client = nil
unless ENV['ELASTICSEARCH_URL'].to_s.empty?
  if ORIGIN.empty?
    print("Cannot proceed further. Please set the ORIGIN environment variable.")
    exit(1)
  end
  puts "Connecting to Elasticsearch..."
  client = Elasticsearch::Client.new(
    url: ENV['ELASTICSEARCH_URL'],
    transport_options: {
      headers: {
        'Authorization': "Bearer #{ENV['ELASTICSEARCH_ACCESS_TOKEN']}"
      },
      request: {
        open_timeout: 60,
        timeout: 60,
      },
    },
  )
  exists = client.indices.exists index: ELASTICSEARCH_INDEX_NAME
  puts "Index already exists: #{exists}"
  unless exists
    client.indices.create index: ELASTICSEARCH_INDEX_NAME, body: {
      "mappings": {
        "properties": {
          "body": { "type": "text", "analyzer": "english" },
          "date": { "type": "date", "format": "yyyy-MM-dd" },
          "edition": { "type": "keyword" },
          "edition_short": { "type": "keyword" },
          "language": { "type": "keyword" },
          "languages": { "type": "keyword" },
          "supported": { "type": "boolean" },
          "deprecated": { "type": "boolean" },
          "tags": { "type": "keyword" },
          "tags_glued": { "type": "text" },
          "predicted_entities": { "type": "keyword" },
          "type": { "type": "keyword" },
          "task": { "type": "keyword" },
          "name": { "type": "text", "analyzer": "simple" },
          "title": { "type": "text", "analyzer": "english" },
          "url": { "type": "keyword" },
          "download_link": { "type": "keyword" },
          "views": { "type": "integer" },
          "downloads": { "type": "integer" },
          "recommended": { "type": "boolean" },
          "annotator": { "type": "keyword" },
          "origin": { "type": "keyword" }
        }
      }
    }
  end
end
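# After the whole site renders: resolve compatible editions for every model,
# copy extra fields into the search documents, and bulk-index the latest
# version of each model.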
Jekyll::Hooks.register :site, :post_render do |site|
  is_incremental = site.config['incremental']
  bulk_indexer = BulkIndexer.new(client)

  uniq_to_models_mapping.each do |uniq, items|
    items.sort_by! { |v| v[:edition_short] }
    model_editions = items.map { |v| v[:edition_short] }.uniq
    items.each do |model|
      next_edition_short = compatible_editions(
        editions,
        model_editions,
        model[:edition_short]
      )
      product_name = to_product_name(model[:edition_short])
      model[:edition_short] = next_edition_short
      if product_name
        case model[:edition_short]
        when Array
          model[:edition_short] << product_name
        when String
          model[:edition_short] = [model[:edition_short], product_name]
        end
      end
      models_json[model[:url]][:compatible_editions] = next_edition_short.empty? ? [] : Array(next_edition_short)

      # Copy the document type, predicted entities, and download link into the search model
      model[:type] = models_json[model[:url]][:type]
      model[:predicted_entities] = models_json[model[:url]][:predicted_entities]
      model[:download_link] = models_json[model[:url]][:download_link]

      if client
        if is_latest?(name_language_editions_sparkversion_to_models_mapping, model)
          id = model.delete(:id)
          model.delete(:uniq_key)
          bulk_indexer.index(id, model)
        end
      end
    end
  end
  bulk_indexer.execute

  if client and (not is_incremental or ENV["FULL_BUILD"])
    # On a full build, delete every indexed document for this origin that no longer corresponds to a post
    client.delete_by_query index: ELASTICSEARCH_INDEX_NAME, body: { query: { bool: { must: { match: { origin: ORIGIN } }, must_not: { ids: { values: all_posts_id } } } } }
  end
end
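# After the site is written: merge incremental results with the on-disk
# backups, write the JSON exports, and upload models.json to S3.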
Jekyll::Hooks.register :site, :post_write do |site|
  is_incremental = site.config['incremental']
  backup_filename = File.join(site.config['source'], 'backup-models.json')
  backup_benchmarking_filename = File.join(site.config['source'], 'backup-benchmarking.json')
  backup_references_filename = File.join(site.config['source'], 'backup-references.json')

  if is_incremental
    # Read the backups and merge them with the incrementally rebuilt posts
    begin
      backup_models_data = JSON.parse(File.read(backup_filename))
    rescue
      backup_models_data = Hash.new
    end
    begin
      backup_benchmarking_data = JSON.parse(File.read(backup_benchmarking_filename))
    rescue
      backup_benchmarking_data = Hash.new
    end
    begin
      backup_references_data = JSON.parse(File.read(backup_references_filename))
    rescue
      backup_references_data = Hash.new
    end

    # Remove deleted posts
    all_deleted_posts.each do |url|
      backup_models_data.delete(url)
      backup_references_data.delete(url)
      backup_benchmarking_data.delete(url)
    end

    models_json = backup_models_data.merge(models_json)
    models_benchmarking_json = backup_benchmarking_data.merge(models_benchmarking_json)
    models_references_json = backup_references_data.merge(models_references_json)
  end

  filename = File.join(site.config['destination'], 'backup-modelss3.json')
  File.write(filename, models_json.values.to_json)
  File.write(backup_filename, models_json.to_json)
  # models.json is served from the pypi S3 bucket
  upload_file_to_s3_bucket(filename)
  File.delete(filename)

  benchmarking_filename = File.join(site.config['destination'], 'benchmarking.json')
  File.write(benchmarking_filename, models_benchmarking_json.to_json)
  File.write(backup_benchmarking_filename, models_benchmarking_json.to_json)

  references_filename = File.join(site.config['destination'], 'references.json')
  File.write(references_filename, models_references_json.to_json)
  File.write(backup_references_filename, models_references_json.to_json)
end
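# When Jekyll cleans obsolete files: capture the deleted post URLs and delete
# the matching documents from the index.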
Jekyll::Hooks.register :clean, :on_obsolete do |files|
  all_deleted_posts = files
    .select { |v| v.include?('/docs/_site') and v.end_with?('.html') }
    .map { |v| v.split('/docs/_site')[1] }
  if client
    client.delete_by_query index: ELASTICSEARCH_INDEX_NAME, body: { query: { bool: { must: [{ ids: { values: all_deleted_posts } }, { match: { origin: ORIGIN } }] } } }
  end
end