// build.sbt for SynapseML (upstream: https://github.com/Azure/mmlspark)
import BuildUtils._
import org.apache.commons.io.FileUtils
import sbt.ExclusionRule
import java.io.File
import scala.xml.transform.{RewriteRule, RuleTransformer}
import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}
val condaEnvName = "synapseml"
val sparkVersion = "3.5.0"
name := "synapseml"
ThisBuild / organization := "com.microsoft.azure"
ThisBuild / scalaVersion := "2.12.17"
val scalaMajorVersion = 2.12
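// Exclusion rules applied (via excludeAll below) to every extra dependency:
// the Spark modules and breeze come with the Spark runtime, and scalatest
// must not leak onto the compile classpath.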
val excludes = Seq(
ExclusionRule("org.apache.spark", s"spark-tags_$scalaMajorVersion"),
ExclusionRule("org.scalatest"),
ExclusionRule("org.scalanlp", s"breeze_$scalaMajorVersion")
)
val coreDependencies = Seq(
// Excluding protobuf-java, as spark-core is bringing the older version transitively.
"org.apache.spark" %% "spark-core" % sparkVersion % "compile" exclude("com.google.protobuf", "protobuf-java"),
"org.apache.spark" %% "spark-mllib" % sparkVersion % "compile",
"org.apache.spark" %% "spark-avro" % sparkVersion % "compile",
"org.apache.spark" %% "spark-tags" % sparkVersion % "test",
"com.globalmentor" % "hadoop-bare-naked-local-fs" % "0.1.0" % "test",
"org.scalatest" %% "scalatest" % "3.2.14" % "test")
val extraDependencies = Seq(
"commons-lang" % "commons-lang" % "2.6",
"org.scalactic" %% "scalactic" % "3.2.14",
"io.spray" %% "spray-json" % "1.3.5",
"com.jcraft" % "jsch" % "0.1.54",
"org.apache.httpcomponents.client5" % "httpclient5" % "5.1.3",
"org.apache.httpcomponents" % "httpmime" % "4.5.13",
"com.linkedin.isolation-forest" %% "isolation-forest_3.5.0" % "3.0.5"
exclude("com.google.protobuf", "protobuf-java") exclude("org.apache.spark", "spark-mllib_2.12")
exclude("org.apache.spark", "spark-core_2.12") exclude("org.apache.spark", "spark-avro_2.12")
exclude("org.apache.spark", "spark-sql_2.12"),
).map(d => d excludeAll (excludes: _*))
val dependencies = coreDependencies ++ extraDependencies
def txt(e: Elem, label: String): String = "\"" + e.child.filter(_.label == label).flatMap(_.text).mkString + "\""
val omittedDeps = Set(s"spark-core_$scalaMajorVersion", s"spark-mllib_$scalaMajorVersion", "org.scala-lang")
// Post-process the generated POM: drop sbt's extraDependencyAttributes block
// and replace scoped or otherwise omitted dependencies with comments.
def pomPostFunc(node: XmlNode): scala.xml.Node = {
new RuleTransformer(new RewriteRule {
override def transform(node: XmlNode): XmlNodeSeq = node match {
case e: Elem if e.label == "extraDependencyAttributes" =>
Comment("Removed Dependency Attributes")
case e: Elem if e.label == "dependency"
&& e.child.exists(child => child.label == "scope") =>
Comment(
s""" scoped dependency ${txt(e, "groupId")} % ${txt(e, "artifactId")}
|% ${txt(e, "version")} % ${txt(e, "scope")} has been omitted """.stripMargin)
case e: Elem if e.label == "dependency"
&& e.child.exists(child => omittedDeps(child.text)) =>
Comment(
s""" excluded dependency ${txt(e, "groupId")} % ${txt(e, "artifactId")}
|% ${txt(e, "version")} has been omitted """.stripMargin)
case _ => node
}
}).transform(node).head
}
pomPostProcess := pomPostFunc
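// Downloads the dataset archive on first use and unpacks it next to the
// packaged artifact. A typical workflow from the sbt shell (assumed):
//   > getDatasets   // one-time download into datasetDir
//   > test          // suites can then read the extracted datasets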
val getDatasetsTask = TaskKey[Unit]("getDatasets", "download datasets used for testing")
val datasetName = "datasets-2023-04-03.tgz"
val datasetUrl = new URI(s"https://mmlspark.blob.core.windows.net/installers/$datasetName").toURL
val datasetDir = settingKey[File]("The directory that holds the dataset")
ThisBuild / datasetDir := {
join((Compile / packageBin / artifactPath).value.getParentFile,
"datasets", datasetName.split(".".toCharArray.head).head)
}
getDatasetsTask := {
val d = datasetDir.value.getParentFile
val f = new File(d, datasetName)
if (!d.exists()) d.mkdirs()
if (!f.exists()) {
FileUtils.copyURLToFile(datasetUrl, f)
UnzipUtils.unzip(f, d)
}
}
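// Writes target/Build.md: the root project's blobArtifactInfo plus links to
// the versioned scala/pyspark documentation pages.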
val genBuildInfo = TaskKey[Unit]("genBuildInfo", "generate a build info file")
genBuildInfo := {
val docInfo =
s"""
|
|### Documentation Pages:
|[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
|[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
|
""".stripMargin
val buildInfo = (root / blobArtifactInfo).value + docInfo
val infoFile = join("target", "Build.md")
if (infoFile.exists()) FileUtils.forceDelete(infoFile)
FileUtils.writeStringToFile(infoFile, buildInfo, "utf-8")
}
val rootGenDir = SettingKey[File]("rootGenDir")
rootGenDir := {
val targetDir = (root / Compile / packageBin / artifactPath).value.getParentFile
join(targetDir, "generated")
}
def runTaskForAllInCompile(task: TaskKey[Unit]): Def.Initialize[Task[Seq[Unit]]] = {
task.all(ScopeFilter(
inProjects(core, deepLearning, cognitive, vw, lightgbm, opencv),
inConfigurations(Compile))
)
}
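// Builds and installs each module's python package, merges the sources, then
// runs sphinx inside the conda environment. Assumes the "synapseml" conda env
// (condaEnvName above) already provides sphinx, e.g. (assumed setup):
//   conda install -n synapseml sphinx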
val generatePythonDoc = TaskKey[Unit]("generatePythonDoc", "Generate sphinx docs for python")
generatePythonDoc := {
runTaskForAllInCompile(installPipPackage).value
runTaskForAllInCompile(mergePyCode).value
val dir = join(rootGenDir.value, "src", "python", "synapse")
join(dir, "__init__.py").createNewFile()
join(dir, "ml", "__init__.py").createNewFile()
runCmd(activateCondaEnv ++ Seq("sphinx-apidoc", "-f", "-o", "doc", "."), dir)
runCmd(activateCondaEnv ++ Seq("sphinx-build", "-b", "html", "doc", "../../../doc/pyspark"), dir)
}
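// Bundles every module's generated python code into one "synapseml" wheel:
// packagePython and mergePyCode run for all modules, a setup.py is written
// into the merged source tree, and the wheel lands under
// <target>/generated/package/python.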
val packageSynapseML = TaskKey[Unit]("packageSynapseML", "package all projects into SynapseML")
packageSynapseML := {
def writeSetupFileToTarget(dir: File): Unit = {
if (!dir.exists()) {
dir.mkdir()
}
val content =
s"""
|# Copyright (C) Microsoft Corporation. All rights reserved.
|# Licensed under the MIT License. See LICENSE in project root for information.
|
|import os
|from setuptools import setup, find_namespace_packages
|import codecs
|import os.path
|
|setup(
| name="synapseml",
| version="${pythonizedVersion(version.value)}",
| description="Synapse Machine Learning",
| long_description="SynapseML contains Microsoft's open source "
| + "contributions to the Apache Spark ecosystem",
| license="MIT",
| packages=find_namespace_packages(include=['synapse.ml.*']),
| url="https://github.com/Microsoft/SynapseML",
| author="Microsoft",
| author_email="synapseml-support@microsoft.com",
| classifiers=[
| "Development Status :: 4 - Beta",
| "Intended Audience :: Developers",
| "Intended Audience :: Science/Research",
| "Topic :: Software Development :: Libraries",
| "License :: OSI Approved :: MIT License",
| "Programming Language :: Python :: 2",
| "Programming Language :: Python :: 3",
| ],
| zip_safe=True,
| package_data={"synapseml": ["../LICENSE.txt", "../README.txt"]},
|)
|
|""".stripMargin
IO.write(join(dir, "setup.py"), content)
}
Def.sequential(
runTaskForAllInCompile(packagePython),
runTaskForAllInCompile(mergePyCode)
).value
val targetDir = rootGenDir.value
val dir = join(targetDir, "src", "python")
val packageDir = join(targetDir, "package", "python").absolutePath
writeSetupFileToTarget(dir)
packagePythonWheelCmd(packageDir, dir)
}
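// Uploads the wheel produced by packageSynapseML with twine. Assumes twine is
// available in the conda env and Secrets.pypiApiToken holds a valid PyPI API
// token (used with the "__token__" username).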
val publishPypi = TaskKey[Unit]("publishPypi", "publish synapseml python wheel to pypi")
publishPypi := {
packageSynapseML.value
val fn = s"${name.value}-${pythonizedVersion(version.value)}-py2.py3-none-any.whl"
runCmd(
activateCondaEnv ++
Seq("twine", "upload", "--skip-existing",
join(rootGenDir.value, "package", "python", fn).toString,
"--username", "__token__", "--password", Secrets.pypiApiToken, "--verbose")
)
}
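// Builds the python (sphinx) and scala (unidoc) docs, stitches them into one
// doc tree with a minimal landing page, and uploads the result to the "docs"
// blob container under this version.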
val publishDocs = TaskKey[Unit]("publishDocs", "publish docs for scala and python")
publishDocs := {
Def.sequential(
generatePythonDoc,
(root / Compile / unidoc)
).value
val html =
"""
|<html><body><pre style="font-size: 150%;">
|<a href="pyspark/index.html">pyspark/</u>
|<a href="scala/index.html">scala/</u>
|</pre></body></html>
""".stripMargin
val targetDir = (root / Compile / packageBin / artifactPath).value.getParentFile
val codegenDir = join(targetDir, "generated")
val unifiedDocDir = join(codegenDir, "doc")
val scalaDir = join(unifiedDocDir.toString, "scala")
if (scalaDir.exists()) FileUtils.forceDelete(scalaDir)
FileUtils.copyDirectory(join(targetDir, "unidoc"), scalaDir)
FileUtils.writeStringToFile(join(unifiedDocDir.toString, "index.html"), html, "utf-8")
uploadToBlob(unifiedDocDir.toString, version.value, "docs")
}
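// enc implements shields.io static-badge escaping: literal underscores and
// dashes are doubled and spaces become single underscores, e.g.
//   enc("master version") == "master_version"
//   enc("my-lib")         == "my--lib"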
val publishBadges = TaskKey[Unit]("publishBadges", "publish badges to synapseml blob")
publishBadges := {
def enc(s: String): String = {
s.replaceAllLiterally("_", "__").replaceAllLiterally(" ", "_").replaceAllLiterally("-", "--")
}
def uploadBadge(left: String, right: String, color: String, filename: String): Unit = {
val badgeDir = join(baseDirectory.value.toString, "target", "badges")
if (!badgeDir.exists()) badgeDir.mkdirs()
runCmd(Seq("curl",
"-o", join(badgeDir.toString, filename).toString,
s"https://img.shields.io/badge/${enc(left)}-${enc(right)}-${enc(color)}"))
singleUploadToBlob(
join(badgeDir.toString, filename).toString,
s"badges/$filename", "icons",
extraArgs = Seq("--content-cache-control", "no-cache", "--content-type", "image/svg+xml"))
}
uploadBadge("master version", version.value, "blue", "master_version3.svg")
}
val uploadNotebooks = TaskKey[Unit]("uploadNotebooks", "upload docs to blob storage")
uploadNotebooks := {
val localNotebooksFolder = join(baseDirectory.value.toString, "docs").toString
val blobNotebooksFolder = version.value
uploadToBlob(localNotebooksFolder, blobNotebooksFolder, "docs")
}
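// Settings shared by every module. The assembly merge strategy discards
// META-INF entries and keeps the first copy of any other duplicated path;
// includeScala = false leaves the scala library out of the fat jar.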
val settings = Seq(
Test / scalastyleConfig := (ThisBuild / baseDirectory).value / "scalastyle-test-config.xml",
Test / logBuffered := false,
Test / parallelExecution := false,
Test / publishArtifact := true,
assembly / test := {},
assembly / assemblyMergeStrategy := {
case PathList("META-INF", _*) => MergeStrategy.discard
case _ => MergeStrategy.first
},
assembly / assemblyOption := (assembly / assemblyOption).value.copy(includeScala = false),
autoAPIMappings := true,
pomPostProcess := pomPostFunc,
sbtPlugin := false
)
ThisBuild / publishMavenStyle := true
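// Module layout: core is the base library; cognitive, vw, lightgbm and opencv
// build on it, deepLearning additionally depends on opencv, and root
// aggregates (and depends on) all of them.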
lazy val core = (project in file("core"))
.enablePlugins(BuildInfoPlugin)
.settings(settings ++ Seq(
libraryDependencies ++= dependencies,
buildInfoKeys ++= Seq[BuildInfoKey](
datasetDir,
version,
scalaVersion,
sbtVersion,
baseDirectory
),
name := "synapseml-core",
buildInfoPackage := "com.microsoft.azure.synapse.ml.build"
): _*)
lazy val deepLearning = (project in file("deep-learning"))
.dependsOn(core % "test->test;compile->compile", opencv % "test->test;compile->compile")
.settings(settings ++ Seq(
libraryDependencies ++= Seq(
"com.microsoft.azure" % "onnx-protobuf_2.12" % "0.9.3",
"com.microsoft.onnxruntime" % "onnxruntime_gpu" % "1.8.1",
"org.apache.hadoop" % "hadoop-common" % "3.3.4" % "test",
"org.apache.hadoop" % "hadoop-azure" % "3.3.4" % "test",
),
name := "synapseml-deep-learning"
): _*)
lazy val lightgbm = (project in file("lightgbm"))
.dependsOn(core % "test->test;compile->compile")
.settings(settings ++ Seq(
libraryDependencies += ("com.microsoft.ml.lightgbm" % "lightgbmlib" % "3.3.510"),
name := "synapseml-lightgbm"
): _*)
lazy val vw = (project in file("vw"))
.dependsOn(core % "test->test;compile->compile")
.settings(settings ++ Seq(
libraryDependencies += ("com.github.vowpalwabbit" % "vw-jni" % "9.3.0"),
name := "synapseml-vw"
): _*)
lazy val cognitive = (project in file("cognitive"))
.dependsOn(core % "test->test;compile->compile")
.settings(settings ++ Seq(
libraryDependencies ++= Seq(
"com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.24.1",
"org.apache.hadoop" % "hadoop-common" % "3.3.4" % "test",
"org.apache.hadoop" % "hadoop-azure" % "3.3.4" % "test",
),
name := "synapseml-cognitive"
): _*)
lazy val opencv = (project in file("opencv"))
.dependsOn(core % "test->test;compile->compile")
.settings(settings ++ Seq(
libraryDependencies += ("org.openpnp" % "opencv" % "3.2.0-1"),
name := "synapseml-opencv"
): _*)
lazy val root = (project in file("."))
.aggregate(core, deepLearning, cognitive, vw, lightgbm, opencv)
.dependsOn(
core % "test->test;compile->compile",
deepLearning % "test->test;compile->compile",
cognitive % "test->test;compile->compile",
vw % "test->test;compile->compile",
lightgbm % "test->test;compile->compile",
opencv % "test->test;compile->compile")
.enablePlugins(ScalaUnidocPlugin)
.disablePlugins(CodegenPlugin)
.settings(settings ++ Seq(
name := "synapseml",
ThisBuild / credentials += Credentials(
"",
"msdata.pkgs.visualstudio.com",
"msdata", Secrets.adoFeedToken),
ThisBuild / useCoursier := false
))
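// One-shot bootstrap for IDE work (e.g. `sbt setup`): compiles every project
// in both Compile and Test and fetches the test datasets.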
val setupTask = TaskKey[Unit]("setup", "set up library for intellij")
setupTask := {
compile.all(ScopeFilter(
inProjects(root, core, deepLearning, cognitive, vw, lightgbm, opencv),
inConfigurations(Compile, Test))
).value
getDatasetsTask.value
}
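// Installs the docgen tool in editable mode (pip install -e) and runs it to
// convert the notebooks under docs/ into markdown for the website.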
val convertNotebooks = TaskKey[Unit]("convertNotebooks", "convert notebooks to markdown for website display")
convertNotebooks := {
runCmd(Seq("pip", "install", "-e", "."), wd=join(baseDirectory.value, "tools/docgen"))
runCmd(Seq("python", "__main__.py"), wd=join(baseDirectory.value, "tools/docgen/docgen"))
}
val testWebsiteDocs = TaskKey[Unit]("testWebsiteDocs",
"test code blocks inside markdowns under folder website/docs/documentation")
testWebsiteDocs := {
runCmd(
Seq("python", s"${join(baseDirectory.value, "website/doctest.py")}", version.value)
)
}