forked from apache/beam
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbeam-sql.sh
More file actions
executable file
·448 lines (397 loc) · 17.8 KB
/
beam-sql.sh
File metadata and controls
executable file
·448 lines (397 loc) · 17.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# A simple launcher for the Apache Beam SQL Shell.
# This script builds a self-contained JAR with all dependencies using Maven,
# which correctly handles service loading for IOs, and caches the JAR.
set -e # Exit immediately if a command exits with a non-zero status.
# --- Configuration ---
# Beam release used when no --version option is given on the command line.
DEFAULT_BEAM_VERSION="2.69.0"
# Java class handed to `java` when the SQL shell is launched.
MAIN_CLASS="org.apache.beam.sdk.extensions.sql.jdbc.BeamSqlLine"
# Directory to store cached executable JAR files
CACHE_DIR="${HOME}/.beam/cache"
# Maven Wrapper Configuration
# MAVEN_WRAPPER_VERSION selects which mvnw script and wrapper JAR are
# downloaded; MAVEN_VERSION selects the Maven distribution referenced by
# MAVEN_DISTRIBUTION_URL (written to maven-wrapper.properties).
MAVEN_WRAPPER_VERSION="3.2.0"
MAVEN_VERSION="3.9.6"
MAVEN_WRAPPER_SCRIPT_URL="https://raw.githubusercontent.com/apache/maven-wrapper/refs/tags/maven-wrapper-${MAVEN_WRAPPER_VERSION}/maven-wrapper-distribution/src/resources/mvnw"
MAVEN_WRAPPER_JAR_URL="https://repo.maven.apache.org/maven2/org/apache/maven/wrapper/maven-wrapper/${MAVEN_WRAPPER_VERSION}/maven-wrapper-${MAVEN_WRAPPER_VERSION}.jar"
MAVEN_DISTRIBUTION_URL="https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/${MAVEN_VERSION}/apache-maven-${MAVEN_VERSION}-bin.zip"
# Maven Plugin Configuration
# Version of the maven-shade-plugin written into the generated pom.xml.
MAVEN_SHADE_PLUGIN_VERSION="3.5.1"
# Make sure the JAR cache directory exists before anything tries to use it.
mkdir -p "${CACHE_DIR}"
# Create a temporary directory for our Maven project.
# It holds the generated pom.xml and is removed again when the script exits.
WORK_DIR=$(mktemp -d)
# Remove the temporary Maven project directory when the script terminates.
# Does nothing when WORK_DIR is empty or no longer points at a directory.
cleanup() {
  [ -n "${WORK_DIR}" ] || return 0
  [ -d "${WORK_DIR}" ] || return 0
  rm -rf "${WORK_DIR}"
}
# Run cleanup on every exit path of the script.
trap cleanup EXIT
# --- Helper Functions ---
# Makes the Maven Wrapper available under ${HOME}/.beam/maven-wrapper and
# points MAVEN_CMD at its mvnw script.
#
# Globals read:    HOME, MAVEN_WRAPPER_SCRIPT_URL, MAVEN_WRAPPER_JAR_URL,
#                  MAVEN_DISTRIBUTION_URL
# Globals written: MAVEN_CMD (only on success)
# Returns:         0 on success, 1 when the wrapper directory cannot be
#                  created or a download/installation step fails.
function setup_maven_wrapper() {
  local beam_dir="${HOME}/.beam"
  local maven_wrapper_dir="${beam_dir}/maven-wrapper"
  local mvnw_script="${maven_wrapper_dir}/mvnw"
  local wrapper_jar="${maven_wrapper_dir}/.mvn/wrapper/maven-wrapper.jar"
  local wrapper_props="${maven_wrapper_dir}/.mvn/wrapper/maven-wrapper.properties"

  # The wrapper counts as cached only when all three files are present.
  if [ -f "${mvnw_script}" ] && [ -f "${wrapper_jar}" ] && [ -f "${wrapper_props}" ]; then
    echo "🔧 Using cached Maven Wrapper from ${maven_wrapper_dir}"
    MAVEN_CMD="${mvnw_script}"
    return 0
  fi

  echo "🔧 Downloading Maven Wrapper for the first time..."
  mkdir -p "${maven_wrapper_dir}/.mvn/wrapper" || return 1

  # Download next to the final location and rename afterwards, so that an
  # interrupted or failed transfer can never be mistaken for a valid cache.
  local mvnw_tmp="${mvnw_script}.partial"
  local jar_tmp="${wrapper_jar}.partial"

  # -f makes curl fail on HTTP errors instead of saving the error page.
  if ! curl -fsSL -o "${mvnw_tmp}" "${MAVEN_WRAPPER_SCRIPT_URL}" ||
    ! curl -fsSL -o "${jar_tmp}" "${MAVEN_WRAPPER_JAR_URL}" ||
    [ ! -s "${mvnw_tmp}" ] || [ ! -s "${jar_tmp}" ]; then
    rm -f "${mvnw_tmp}" "${jar_tmp}"
    echo "❌ Error: Failed to download the Maven Wrapper. Please check your internet connection." >&2
    return 1
  fi

  # Publish the files. The properties file (which selects the Maven
  # distribution) is written last, because its presence completes the
  # "already cached" check above.
  if ! {
    chmod +x "${mvnw_tmp}" &&
      mv -f "${mvnw_tmp}" "${mvnw_script}" &&
      mv -f "${jar_tmp}" "${wrapper_jar}" &&
      echo "distributionUrl=${MAVEN_DISTRIBUTION_URL}" > "${wrapper_props}"
  }; then
    rm -f "${mvnw_tmp}" "${jar_tmp}"
    echo "❌ Error: Failed to install the Maven Wrapper into ${maven_wrapper_dir}." >&2
    return 1
  fi

  echo "✅ Maven Wrapper cached in ${maven_wrapper_dir} for future use"
  MAVEN_CMD="${mvnw_script}"
}
# Writes the command-line help to stdout and ends the script with exit
# status 1. DEFAULT_BEAM_VERSION is interpolated into the --version line and
# $0 into the synopsis.
function usage() {
  cat << EOF
Usage: $0 [--version <beam_version>] [--runner <runner_name>] [--io <io_connector>] [--list-versions] [--list-ios] [--list-runners] [--debug] [-h|--help]

A self-contained launcher for the Apache Beam SQL Shell.

Options:
 --version Specify the Apache Beam version (default: ${DEFAULT_BEAM_VERSION}).
 --runner Specify the Beam runner to use (default: direct).
 Supported runners:
 direct - DirectRunner (runs locally, good for development)
 dataflow - DataflowRunner (runs on Google Cloud Dataflow)
 --io Specify an IO connector to include. Can be used multiple times.
 Available connectors: amazon-web-services2, amqp, azure,
 azure-cosmos, cassandra, cdap, clickhouse, csv, debezium, elasticsearch,
 google-ads, google-cloud-platform, hadoop-format, hbase, hcatalog, iceberg,
 influxdb, jdbc, jms, json, kafka, kinesis, kudu, mongodb, mqtt, neo4j,
 parquet, pulsar, rabbitmq, redis, singlestore, snowflake, solace, solr,
 sparkreceiver, splunk, synthetic, thrift, tika, xml
 --list-versions List all available Beam versions from Maven Central and exit.
 --list-ios List all available IO connectors from Maven Central and exit.
 --list-runners List all available runners and exit.
 --debug Enable debug mode (sets bash -x flag).
 -h, --help Show this help message.
EOF
  exit 1
}
# Prints the ten newest released (non-SNAPSHOT) Apache Beam versions, newest
# first, taken from the beam-sdks-java-core metadata on Maven Central.
# Returns 1 with a message on stderr when curl is unavailable or when no
# version could be extracted from the response.
function list_versions() {
  echo "🔎 Fetching the 10 most recent Apache Beam versions from Maven Central..."
  local maven_metadata="https://repo1.maven.org/maven2/org/apache/beam/beam-sdks-java-core/maven-metadata.xml"

  command -v curl &> /dev/null || {
    echo "❌ Error: 'curl' is required to fetch the version list." >&2
    return 1
  }

  # download -> keep <version> lines -> strip the tags -> drop snapshots
  # -> newest first -> first ten
  local latest
  latest=$(
    curl -sS "${maven_metadata}" |
      grep '<version>' |
      sed 's/.*<version>\(.*\)<\/version>.*/\1/' |
      grep -v 'SNAPSHOT' |
      sort -rV |
      head -n 10
  )

  if [[ -n "${latest}" ]]; then
    echo "✅ 10 latest versions:"
    echo "${latest}"
    return
  fi

  echo "❌ Could not retrieve versions. Please check your internet connection or the Maven Central status." >&2
  return 1
}
# Prints the Beam IO connector names found through the Maven Central search
# API (artifacts named beam-sdks-java-io-*), de-duplicated and wrapped at 80
# columns. Test, expansion-service, parent and upgrade artifacts are dropped.
# When nothing can be retrieved, a built-in list is printed instead and the
# function returns 1. Also returns 1 when curl is not installed.
function list_ios() {
  echo "🔎 Fetching available Apache Beam IO connectors from Maven Central..."
  local query_url="https://search.maven.org/solrsearch/select?q=g:org.apache.beam+AND+a:beam-sdks-java-io-*&rows=100&wt=json"

  command -v curl &> /dev/null || {
    echo "❌ Error: 'curl' is required to fetch the IO connector list." >&2
    return 1
  }

  # Pull every "a":"beam-sdks-java-io-<name>" pair out of the JSON response
  # and reduce it to <name>.
  local connectors
  connectors=$(
    curl -sS "${query_url}" |
      grep -o '"a":"beam-sdks-java-io-[^"]*"' |
      sed 's/"a":"beam-sdks-java-io-\([^"]*\)"/\1/' |
      grep -v -E '(tests?|expansion-service|parent|upgrade)' |
      sort -u
  )

  if [[ -n "${connectors}" ]]; then
    echo "✅ Available IO connectors:"
    # Join the names with spaces, wrap at 80 columns, indent by one space.
    tr '\n' ' ' <<< "${connectors}" | fold -s -w 80 | sed 's/^/ /'
    return
  fi

  echo "❌ Could not retrieve IO connectors. Please check your internet connection or try again later." >&2
  printf '%s\n' \
    "📋 Here are the known IO connectors (may not be complete):" \
    "amazon-web-services2, amqp, azure, azure-cosmos, cassandra," \
    "cdap, clickhouse, csv, debezium, elasticsearch, google-ads, google-cloud-platform," \
    "hadoop-format, hbase, hcatalog, iceberg, influxdb, jdbc, jms, json, kafka, kinesis," \
    "kudu, mongodb, mqtt, neo4j, parquet, pulsar, rabbitmq, redis, singlestore, snowflake," \
    "solace, solr, sparkreceiver, splunk, synthetic, thrift, tika, xml"
  return 1
}
# Prints the Beam runners published on Maven Central for ${BEAM_VERSION},
# one short description per runner, followed by a usage hint.
# Artifacts named beam-runners-* are looked up through the Maven Central
# search API; support/internal artifacts are filtered out by name.
# When nothing can be retrieved, a built-in list is printed instead and the
# function returns 1. Also returns 1 when curl is not installed.
function list_runners() {
  echo "🚀 Fetching available Apache Beam runners for version ${BEAM_VERSION} from Maven Central..."
  local query_url="https://search.maven.org/solrsearch/select?q=g:org.apache.beam+AND+a:beam-runners-*+AND+v:${BEAM_VERSION}&rows=100&wt=json"

  command -v curl &> /dev/null || {
    echo "❌ Error: 'curl' is required to fetch the runner list." >&2
    return 1
  }

  # Pull every "a":"beam-runners-<name>" pair out of the JSON response and
  # reduce it to <name>.
  local artifacts
  artifacts=$(
    curl -sS "${query_url}" |
      grep -o '"a":"beam-runners-[^"]*"' |
      sed 's/"a":"beam-runners-\([^"]*\)"/\1/' |
      grep -v -E '(tests?|parent|core-construction|core-java|extensions|job-server|legacy-worker|windmill|examples|experimental|orchestrator|java-fn-execution|java-job-service|gcp-gcemd|gcp-gcsproxy|local-java-core|portability-java|prism-java|reference-java)' |
      sort -u
  )

  if [[ -z "${artifacts}" ]]; then
    echo "❌ Could not retrieve runners for version ${BEAM_VERSION}. Please check your internet connection or try again later." >&2
    printf '%s\n' \
      "📋 Here are the known runners for recent Beam versions (may not be complete):" \
      "" \
      " direct - DirectRunner (runs locally, good for development)" \
      " dataflow - DataflowRunner (runs on Google Cloud Dataflow)" \
      " flink - FlinkRunner (runs on Apache Flink)" \
      " spark - SparkRunner (runs on Apache Spark)" \
      " samza - SamzaRunner (runs on Apache Samza)" \
      " jet - JetRunner (runs on Hazelcast Jet)" \
      " twister2 - Twister2Runner (runs on Twister2)" \
      "" \
      "💡 Usage: ./beam-sql.sh --runner <runner_name>" \
      " Default: direct" \
      " Note: Only 'direct' and 'dataflow' are currently supported by this script."
    return 1
  fi

  echo "✅ Available runners for Beam ${BEAM_VERSION}:"
  echo ""

  # One artifact suffix per line; map each to a short description.
  local artifact flink_ver short_name
  while IFS= read -r artifact; do
    case "${artifact}" in
      "direct-java")
        printf '%s\n' \
          " direct - DirectRunner" \
          " Runs locally on your machine. Good for development and testing."
        ;;
      "google-cloud-dataflow-java")
        printf '%s\n' \
          " dataflow - DataflowRunner" \
          " Runs on Google Cloud Dataflow for production workloads."
        ;;
      flink-* | flink_*)
        # Whatever follows the "flink-" / "flink_" prefix is the Flink version.
        flink_ver="${artifact#flink[-_]}"
        printf '%s\n' \
          " flink-${flink_ver} - FlinkRunner (Flink ${flink_ver})" \
          " Runs on Apache Flink ${flink_ver} clusters."
        ;;
      "spark")
        printf '%s\n' \
          " spark - SparkRunner" \
          " Runs on Apache Spark clusters."
        ;;
      "spark-3")
        printf '%s\n' \
          " spark-3 - SparkRunner (Spark 3.x)" \
          " Runs on Apache Spark 3.x clusters."
        ;;
      "samza")
        printf '%s\n' \
          " samza - SamzaRunner" \
          " Runs on Apache Samza."
        ;;
      "jet")
        printf '%s\n' \
          " jet - JetRunner" \
          " Runs on Hazelcast Jet."
        ;;
      "twister2")
        printf '%s\n' \
          " twister2 - Twister2Runner" \
          " Runs on Twister2."
        ;;
      "apex")
        printf '%s\n' \
          " apex - ApexRunner" \
          " Runs on Apache Apex."
        ;;
      "gearpump")
        printf '%s\n' \
          " gearpump - GearpumpRunner" \
          " Runs on Apache Gearpump."
        ;;
      "prism")
        printf '%s\n' \
          " prism - PrismRunner" \
          " Local runner for testing portable pipelines."
        ;;
      "reference")
        printf '%s\n' \
          " reference - ReferenceRunner" \
          " Reference implementation for testing."
        ;;
      "portability")
        printf '%s\n' \
          " portability - PortabilityRunner" \
          " For portable pipeline execution."
        ;;
      *)
        # Unknown artifact: drop a trailing "-java", then a leading "gcp-",
        # then a leading "local-", and show the result next to the raw name.
        short_name="${artifact%-java}"
        short_name="${short_name#gcp-}"
        short_name="${short_name#local-}"
        echo " ${short_name} - ${artifact}"
        ;;
    esac
  done <<< "${artifacts}"

  printf '%s\n' \
    "" \
    "💡 Usage: ./beam-sql.sh --runner <runner_name>" \
    " Default: direct" \
    " Note: This script currently supports 'direct' and 'dataflow' runners." \
    " Other runners may require additional setup and dependencies."
}
# --- Argument Parsing ---
# Defaults; parse_args overrides them from the command line.
BEAM_VERSION="${DEFAULT_BEAM_VERSION}"
IO_CONNECTORS=()
BEAM_RUNNER="direct"
SQLLINE_ARGS=()
DEBUG_MODE=false

# Parses the script's command-line arguments.
#
# Arguments:       the script's positional parameters ("$@")
# Globals written: BEAM_VERSION, BEAM_RUNNER (lower-cased), IO_CONNECTORS
#                  (appended), DEBUG_MODE, SQLLINE_ARGS (every unrecognised
#                  argument, in order)
# Exits:           1 with a message on stderr when --version, --runner or
#                  --io is given without a value. When a --list-* option was
#                  given, runs that listing after ALL arguments have been
#                  read (so e.g. `--list-runners --version X` honours X) and
#                  exits with the listing's status.
function parse_args() {
  local list_action=""

  while [[ "$#" -gt 0 ]]; do
    case "$1" in
      --version | --runner | --io)
        # Without this check a missing value made the trailing `shift` fail,
        # which under `set -e` ended the script without any message.
        if [[ "$#" -lt 2 || -z "$2" ]]; then
          echo "❌ Error: Option '$1' requires a value." >&2
          exit 1
        fi
        case "$1" in
          --version) BEAM_VERSION="$2" ;;
          # tr rather than ${2,,} so the script keeps working on bash 3.x.
          --runner) BEAM_RUNNER=$(printf '%s' "$2" | tr '[:upper:]' '[:lower:]') ;;
          --io) IO_CONNECTORS+=("$2") ;;
        esac
        shift
        ;;
      # Only remember the first listing that was requested; it runs below.
      --list-versions) [[ -n "${list_action}" ]] || list_action="list_versions" ;;
      --list-ios) [[ -n "${list_action}" ]] || list_action="list_ios" ;;
      --list-runners) [[ -n "${list_action}" ]] || list_action="list_runners" ;;
      --debug) DEBUG_MODE=true ;;
      -h | --help) usage ;;
      *) SQLLINE_ARGS+=("$1") ;;
    esac
    shift
  done

  if [[ -n "${list_action}" ]]; then
    "${list_action}"
    exit $?
  fi
}

parse_args "$@"
# Turn on command tracing when --debug was given.
[[ "${DEBUG_MODE}" != true ]] || set -x

# --- Prerequisite Check ---
# java launches the shell, so it must be on the PATH.
command -v java > /dev/null 2>&1 || {
  echo "❌ Error: 'java' command not found. It is required to run the application." >&2
  exit 1
}
# curl is what setup_maven_wrapper uses for its downloads.
command -v curl > /dev/null 2>&1 || {
  echo "❌ Error: 'curl' command not found. It is required to download the Maven wrapper." >&2
  exit 1
}

# Sets MAVEN_CMD.
setup_maven_wrapper

# Summarise the configuration that is about to be started.
echo "🚀 Preparing Beam SQL Shell v${BEAM_VERSION}..."
echo " Runner: ${BEAM_RUNNER}"
if ((${#IO_CONNECTORS[@]} > 0)); then
  echo " Including IOs: ${IO_CONNECTORS[*]}"
fi
# --- Dependency Resolution & JAR Caching ---
# Resolves the executable ("uber") JAR for the requested configuration and
# stores its path in CP. A JAR is looked up in CACHE_DIR under a name that
# encodes the Beam version, the runner and the sorted IO connector list; when
# none exists, a throw-away Maven project is generated in WORK_DIR, built with
# the maven-shade-plugin, and the result is copied into the cache.
sorted_ios_str=$(printf "%s\n" "${IO_CONNECTORS[@]}" | sort | tr '\n' '-' | sed 's/-$//')
CACHE_KEY="beam-${BEAM_VERSION}_runner-${BEAM_RUNNER}_ios-${sorted_ios_str}.jar"
CACHE_FILE="${CACHE_DIR}/${CACHE_KEY}"

if [ -f "${CACHE_FILE}" ]; then
  echo "✅ Found cached executable JAR. Skipping build."
  CP="${CACHE_FILE}"
else
  echo "🔎 No cache found. Building executable JAR (this might take a moment on first run)..."

  # Resolve the runner first, so an unsupported value is rejected before any
  # file is generated. The direct runner needs no extra artifact here.
  RUNNER_ARTIFACT=""
  case "${BEAM_RUNNER}" in
    dataflow) RUNNER_ARTIFACT="beam-runners-google-cloud-dataflow-java" ;;
    direct) ;;
    *)
      echo "❌ Error: Unsupported runner '${BEAM_RUNNER}'." >&2
      exit 1
      ;;
  esac

  # --- Dynamic POM Generation ---
  # "\${beam.version}" is escaped so that it reaches the POM literally and is
  # resolved by Maven from the <properties> section written further down.
  POM_FILE="${WORK_DIR}/pom.xml"
  cat > "${POM_FILE}" << EOL
<project xmlns="http://maven.apache.org/POM/4.0.0"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.apache.beam</groupId>
  <artifactId>beam-sql-shell-runner</artifactId>
  <version>1.0</version>
  <dependencies>
    <dependency>
      <groupId>org.apache.beam</groupId>
      <artifactId>beam-sdks-java-extensions-sql-jdbc</artifactId>
      <version>\${beam.version}</version>
    </dependency>
EOL

  # Add IO and Runner dependencies
  for io in "${IO_CONNECTORS[@]}"; do
    echo "    <dependency><groupId>org.apache.beam</groupId><artifactId>beam-sdks-java-io-${io}</artifactId><version>\${beam.version}</version></dependency>" >> "${POM_FILE}"
  done
  if [ -n "${RUNNER_ARTIFACT}" ]; then
    echo "    <dependency><groupId>org.apache.beam</groupId><artifactId>${RUNNER_ARTIFACT}</artifactId><version>\${beam.version}</version></dependency>" >> "${POM_FILE}"
  fi

  # Complete the POM with the build section for the maven-shade-plugin.
  # ServicesResourceTransformer merges META-INF/services entries; the filter
  # drops signature files from the merged JAR.
  cat >> "${POM_FILE}" << EOL
  </dependencies>
  <properties>
    <beam.version>${BEAM_VERSION}</beam.version>
  </properties>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>${MAVEN_SHADE_PLUGIN_VERSION}</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
              </transformers>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
EOL

  # Use `mvn package` to build the uber JAR. MAVEN_CMD is quoted because it
  # lives below ${HOME}, which may contain spaces.
  if ! "${MAVEN_CMD}" -f "${POM_FILE}" -q --batch-mode package; then
    echo "❌ Maven build failed. See the Maven output above for details." >&2
    exit 1
  fi
  UBER_JAR_PATH="${WORK_DIR}/target/beam-sql-shell-runner-1.0.jar"

  # Check if build was successful before caching
  if [ ! -f "${UBER_JAR_PATH}" ]; then
    echo "❌ Maven build failed. The uber JAR was not created." >&2
    exit 1
  fi

  # Copy to a temporary name inside the cache directory and rename it into
  # place, so an interrupted copy can never be picked up as a valid cached JAR.
  CACHE_TMP_FILE="${CACHE_FILE}.tmp.$$"
  if ! cp "${UBER_JAR_PATH}" "${CACHE_TMP_FILE}" ||
    ! mv -f "${CACHE_TMP_FILE}" "${CACHE_FILE}"; then
    rm -f "${CACHE_TMP_FILE}"
    echo "❌ Error: Could not store the built JAR in ${CACHE_DIR}." >&2
    exit 1
  fi
  CP="${CACHE_FILE}"
  echo "💾 JAR built and cached for future use."
fi
# --- Launch Shell ---
# Start the SQL shell from the resolved JAR, forwarding every command-line
# argument the launcher did not consume itself.
printf '%s\n' \
  "✅ Dependencies ready. Launching Beam SQL Shell..." \
  "----------------------------------------------------"
java -cp "${CP}" "${MAIN_CLASS}" "${SQLLINE_ARGS[@]}"
printf '%s\n' \
  "----------------------------------------------------" \
  "👋 Exited Beam SQL Shell."