Skip to content

Commit d526819

Browse files
authored
Databricks: Add support for OPTIMIZE, PARTITIONED BY, and STRUCT (apache#2170)
1 parent d0a0b3e · commit d526819

File tree

4 files changed

+290
-11
lines changed

4 files changed

+290
-11
lines changed

src/ast/mod.rs

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4578,22 +4578,40 @@ pub enum Statement {
45784578
/// Legacy copy-style options.
45794579
options: Vec<CopyLegacyOption>,
45804580
},
4581+
/// ClickHouse:
45814582
/// ```sql
45824583
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
45834584
/// ```
4584-
///
45854585
/// See ClickHouse <https://clickhouse.com/docs/en/sql-reference/statements/optimize>
4586+
///
4587+
/// Databricks:
4588+
/// ```sql
4589+
/// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
4590+
/// ```
4591+
/// See Databricks <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
45864592
OptimizeTable {
45874593
/// Table name to optimize.
45884594
name: ObjectName,
4595+
/// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE TABLE`, Databricks uses `OPTIMIZE`).
4596+
has_table_keyword: bool,
45894597
/// Optional cluster identifier.
4598+
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
45904599
on_cluster: Option<Ident>,
45914600
/// Optional partition spec.
4601+
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
45924602
partition: Option<Partition>,
45934603
/// Whether `FINAL` was specified.
4604+
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
45944605
include_final: bool,
45954606
/// Optional deduplication settings.
4607+
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
45964608
deduplicate: Option<Deduplicate>,
4609+
/// Optional WHERE predicate.
4610+
/// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
4611+
predicate: Option<Expr>,
4612+
/// Optional ZORDER BY columns.
4613+
/// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
4614+
zorder: Option<Vec<Expr>>,
45974615
},
45984616
/// ```sql
45994617
/// LISTEN
@@ -6069,12 +6087,19 @@ impl fmt::Display for Statement {
60696087
}
60706088
Statement::OptimizeTable {
60716089
name,
6090+
has_table_keyword,
60726091
on_cluster,
60736092
partition,
60746093
include_final,
60756094
deduplicate,
6095+
predicate,
6096+
zorder,
60766097
} => {
6077-
write!(f, "OPTIMIZE TABLE {name}")?;
6098+
write!(f, "OPTIMIZE")?;
6099+
if *has_table_keyword {
6100+
write!(f, " TABLE")?;
6101+
}
6102+
write!(f, " {name}")?;
60786103
if let Some(on_cluster) = on_cluster {
60796104
write!(f, " ON CLUSTER {on_cluster}")?;
60806105
}
@@ -6087,6 +6112,12 @@ impl fmt::Display for Statement {
60876112
if let Some(deduplicate) = deduplicate {
60886113
write!(f, " {deduplicate}")?;
60896114
}
6115+
if let Some(predicate) = predicate {
6116+
write!(f, " WHERE {predicate}")?;
6117+
}
6118+
if let Some(zorder) = zorder {
6119+
write!(f, " ZORDER BY ({})", display_comma_separated(zorder))?;
6120+
}
60906121
Ok(())
60916122
}
60926123
Statement::LISTEN { channel } => {

src/dialect/databricks.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,4 +85,9 @@ impl Dialect for DatabricksDialect {
8585
fn supports_values_as_table_factor(&self) -> bool {
8686
true
8787
}
88+
89+
/// See <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
90+
fn supports_optimize_table(&self) -> bool {
91+
true
92+
}
8893
}

src/parser/mod.rs

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -693,7 +693,6 @@ impl<'a> Parser<'a> {
693693
// `INSTALL` is duckdb specific https://duckdb.org/docs/extensions/overview
694694
Keyword::INSTALL if self.dialect.supports_install() => self.parse_install(),
695695
Keyword::LOAD => self.parse_load(),
696-
// `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
697696
Keyword::OPTIMIZE if self.dialect.supports_optimize_table() => {
698697
self.parse_optimize_table()
699698
}
@@ -3382,24 +3381,28 @@ impl<'a> Parser<'a> {
33823381
///
33833382
/// ```sql
33843383
/// [field_name] field_type
3384+
/// field_name: field_type
33853385
/// ```
33863386
///
33873387
/// [struct]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
33883388
/// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
3389+
/// [databricks]: https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
33893390
fn parse_struct_field_def(
33903391
&mut self,
33913392
) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
33923393
// Look beyond the next item to infer whether both field name
33933394
// and type are specified.
3394-
let is_anonymous_field = !matches!(
3395+
let is_named_field = matches!(
33953396
(self.peek_nth_token(0).token, self.peek_nth_token(1).token),
3396-
(Token::Word(_), Token::Word(_))
3397+
(Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon)
33973398
);
33983399

3399-
let field_name = if is_anonymous_field {
3400-
None
3400+
let field_name = if is_named_field {
3401+
let name = self.parse_identifier()?;
3402+
let _ = self.consume_token(&Token::Colon);
3403+
Some(name)
34013404
} else {
3402-
Some(self.parse_identifier()?)
3405+
None
34033406
};
34043407

34053408
let (field_type, trailing_bracket) = self.parse_data_type_helper()?;
@@ -7985,7 +7988,8 @@ impl<'a> Parser<'a> {
79857988
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
79867989
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
79877990
self.expect_token(&Token::LParen)?;
7988-
let columns = self.parse_comma_separated(Parser::parse_column_def)?;
7991+
let columns =
7992+
self.parse_comma_separated(|parser| parser.parse_column_def_inner(true))?;
79897993
self.expect_token(&Token::RParen)?;
79907994
Ok(HiveDistributionStyle::PARTITIONED { columns })
79917995
} else {
@@ -8809,9 +8813,19 @@ impl<'a> Parser<'a> {
88098813

88108814
/// Parse column definition.
88118815
pub fn parse_column_def(&mut self) -> Result<ColumnDef, ParserError> {
8816+
self.parse_column_def_inner(false)
8817+
}
8818+
8819+
fn parse_column_def_inner(
8820+
&mut self,
8821+
optional_data_type: bool,
8822+
) -> Result<ColumnDef, ParserError> {
88128823
let col_name = self.parse_identifier()?;
88138824
let data_type = if self.is_column_type_sqlite_unspecified() {
88148825
DataType::Unspecified
8826+
} else if optional_data_type {
8827+
self.maybe_parse(|parser| parser.parse_data_type())?
8828+
.unwrap_or(DataType::Unspecified)
88158829
} else {
88168830
self.parse_data_type()?
88178831
};
@@ -11917,7 +11931,8 @@ impl<'a> Parser<'a> {
1191711931
let field_defs = self.parse_duckdb_struct_type_def()?;
1191811932
Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
1191911933
}
11920-
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => {
11934+
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) =>
11935+
{
1192111936
self.prev_token();
1192211937
let (field_defs, _trailing_bracket) =
1192311938
self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -18480,13 +18495,23 @@ impl<'a> Parser<'a> {
1848018495
}
1848118496
}
1848218497

18498+
/// ClickHouse:
1848318499
/// ```sql
1848418500
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
1848518501
/// ```
1848618502
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
18503+
///
18504+
/// Databricks:
18505+
/// ```sql
18506+
/// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
18507+
/// ```
18508+
/// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
1848718509
pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> {
18488-
self.expect_keyword_is(Keyword::TABLE)?;
18510+
let has_table_keyword = self.parse_keyword(Keyword::TABLE);
18511+
1848918512
let name = self.parse_object_name(false)?;
18513+
18514+
// ClickHouse-specific options
1849018515
let on_cluster = self.parse_optional_on_cluster()?;
1849118516

1849218517
let partition = if self.parse_keyword(Keyword::PARTITION) {
@@ -18500,6 +18525,7 @@ impl<'a> Parser<'a> {
1850018525
};
1850118526

1850218527
let include_final = self.parse_keyword(Keyword::FINAL);
18528+
1850318529
let deduplicate = if self.parse_keyword(Keyword::DEDUPLICATE) {
1850418530
if self.parse_keyword(Keyword::BY) {
1850518531
Some(Deduplicate::ByExpression(self.parse_expr()?))
@@ -18510,12 +18536,31 @@ impl<'a> Parser<'a> {
1851018536
None
1851118537
};
1851218538

18539+
// Databricks-specific options
18540+
let predicate = if self.parse_keyword(Keyword::WHERE) {
18541+
Some(self.parse_expr()?)
18542+
} else {
18543+
None
18544+
};
18545+
18546+
let zorder = if self.parse_keywords(&[Keyword::ZORDER, Keyword::BY]) {
18547+
self.expect_token(&Token::LParen)?;
18548+
let columns = self.parse_comma_separated(|p| p.parse_expr())?;
18549+
self.expect_token(&Token::RParen)?;
18550+
Some(columns)
18551+
} else {
18552+
None
18553+
};
18554+
1851318555
Ok(Statement::OptimizeTable {
1851418556
name,
18557+
has_table_keyword,
1851518558
on_cluster,
1851618559
partition,
1851718560
include_final,
1851818561
deduplicate,
18562+
predicate,
18563+
zorder,
1851918564
})
1852018565
}
1852118566

0 commit comments

Comments (0)