profile
viewpoint
André Kohn ankoh Technical University Munich Germany

philipbecker/cpp-art 9

STL-compliant map and set container based on a C++11 Adaptive Radix Tree implementation.

ankoh/logbot 7

A logging bot for Slack written in Python

ankoh/vmlcm 7

VMware (Fusion) Linked Clones Manager

ankoh/maethrillian 3

Community-Maintained Halo Wars 2 Mod

ankoh/docker-atlassian 2

Docker-Compose file for the Atlassian Stack behind a Reverse Proxy

ankoh/dotfiles 1

This is how I work.

ankoh/mc-server 1

Mendeley-Cache server used at the TUM Chair for Applied Software Engineering

ankoh/didact 0

Halo Wars 2 Statistics

ankoh/displayplacer 0

macOS command line utility to configure multi-display resolutions and arrangements. Essentially XRandR for macOS.

Pull request review commentcwida/duckdb

Pre-filtering data in zonemaps and #1303

+#include "duckdb/execution/expression_executor.hpp"+#include "duckdb/optimizer/rule/in_clause_simplification.hpp"+#include "duckdb/planner/expression/list.hpp"+#include "duckdb/planner/expression/bound_operator_expression.hpp"++namespace duckdb {++InClauseSimplificationRule::InClauseSimplificationRule(ExpressionRewriter &rewriter) : Rule(rewriter) {+	// match on InClauseExpression that has a ConstantExpression as a check+	auto op = make_unique<InClauseExpressionMatcher>();+	op->policy = SetMatcher::Policy::SOME;+	root = move(op);+}++unique_ptr<Expression> InClauseSimplificationRule::Apply(LogicalOperator &op, vector<Expression *> &bindings,+                                                         bool &changes_made) {+	D_ASSERT(bindings[0]->expression_class == ExpressionClass::BOUND_OPERATOR);+	auto expr = (BoundOperatorExpression *)bindings[0];+	if (expr->children[0]->expression_class != ExpressionClass::BOUND_CAST) {+		return nullptr;+	}+	auto cast_expression = (BoundCastExpression *)expr->children[0].get();+	if (cast_expression->child->expression_class != ExpressionClass::BOUND_COLUMN_REF) {+		return nullptr;+	}+	//! Here we check if we can apply the expression on the constant side+	auto target_type = cast_expression->source_type();+	if (!BoundCastExpression::CastIsInvertible(target_type, cast_expression->return_type)) {+		return nullptr;+	}+	for (size_t i{1}; i < expr->children.size(); i++) {+		if (expr->children[i]->expression_class != ExpressionClass::BOUND_CONSTANT) {+			return nullptr;+		}+		D_ASSERT(expr->children[i]->IsFoldable());+		auto constant_value = ExpressionExecutor::EvaluateScalar(*expr->children[i]);+		auto new_constant = constant_value.TryCastAs(target_type);+		if (new_constant) {+			//! We can cast, so we move the new constant+			auto new_constant_expr = make_unique<BoundConstantExpression>(constant_value);+			expr->children[i] = move(new_constant_expr);

good catch

pdet

comment created time in 2 hours

fork domoritz/plugin-pyenv-1

Simple Python Version Management integration.

fork in 5 hours

fork domoritz/plugin-cd

A featured cd can make your fish journey much more pleasant.

https://github.com/sancoder-q/plugin-cd

fork in 6 hours

Pull request review commentcwida/duckdb

Pre-filtering data in zonemaps and #1303

+#include "duckdb/execution/expression_executor.hpp"+#include "duckdb/optimizer/rule/in_clause_simplification.hpp"+#include "duckdb/planner/expression/list.hpp"+#include "duckdb/planner/expression/bound_operator_expression.hpp"++namespace duckdb {++InClauseSimplificationRule::InClauseSimplificationRule(ExpressionRewriter &rewriter) : Rule(rewriter) {+	// match on InClauseExpression that has a ConstantExpression as a check+	auto op = make_unique<InClauseExpressionMatcher>();+	op->policy = SetMatcher::Policy::SOME;+	root = move(op);+}++unique_ptr<Expression> InClauseSimplificationRule::Apply(LogicalOperator &op, vector<Expression *> &bindings,+                                                         bool &changes_made) {+	D_ASSERT(bindings[0]->expression_class == ExpressionClass::BOUND_OPERATOR);+	auto expr = (BoundOperatorExpression *)bindings[0];+	if (expr->children[0]->expression_class != ExpressionClass::BOUND_CAST) {+		return nullptr;+	}+	auto cast_expression = (BoundCastExpression *)expr->children[0].get();+	if (cast_expression->child->expression_class != ExpressionClass::BOUND_COLUMN_REF) {+		return nullptr;+	}+	//! Here we check if we can apply the expression on the constant side+	auto target_type = cast_expression->source_type();+	if (!BoundCastExpression::CastIsInvertible(target_type, cast_expression->return_type)) {+		return nullptr;+	}+	for (size_t i{1}; i < expr->children.size(); i++) {+		if (expr->children[i]->expression_class != ExpressionClass::BOUND_CONSTANT) {+			return nullptr;+		}+		D_ASSERT(expr->children[i]->IsFoldable());+		auto constant_value = ExpressionExecutor::EvaluateScalar(*expr->children[i]);+		auto new_constant = constant_value.TryCastAs(target_type);+		if (new_constant) {+			//! We can cast, so we move the new constant+			auto new_constant_expr = make_unique<BoundConstantExpression>(constant_value);+			expr->children[i] = move(new_constant_expr);

Shouldn't we first check if all children can be cast before actually modifying the IN operator? What if we have e.g.

SELECT x::VARCHAR IN ('1', y) FROM (VALUES (1, 2), (2, 3)) tbl(x, y);

The first element contains an invertible cast ('1' -> 1), but the second element is not invertible. Could you add a test that verifies this does not give problems?

pdet

comment created time in 8 hours

Pull request review commentcwida/duckdb

Pre-filtering data in zonemaps and #1303

 FilterPropagateResult StatisticsPropagator::PropagateComparison(BaseStatistics & 	default: 		return FilterPropagateResult::NO_PRUNING_POSSIBLE; 	}+	switch (right.type.InternalType()) {

Any reason for adding this check? The left and right type should be identical, no?

pdet

comment created time in 8 hours

Pull request review commentcwida/duckdb

Pre-filtering data in zonemaps and #1303

+#include "duckdb/execution/expression_executor.hpp"+#include "duckdb/optimizer/rule/in_clause_simplification.hpp"+#include "duckdb/planner/expression/list.hpp"+#include "duckdb/planner/expression/bound_operator_expression.hpp"++namespace duckdb {++InClauseSimplificationRule::InClauseSimplificationRule(ExpressionRewriter &rewriter) : Rule(rewriter) {+	// match on InClauseExpression that has a ConstantExpression as a check+	auto op = make_unique<InClauseExpressionMatcher>();+	op->policy = SetMatcher::Policy::SOME;+	root = move(op);+}++unique_ptr<Expression> InClauseSimplificationRule::Apply(LogicalOperator &op, vector<Expression *> &bindings,+                                                         bool &changes_made) {+	D_ASSERT(bindings[0]->expression_class == ExpressionClass::BOUND_OPERATOR);+	auto expr = (BoundOperatorExpression *)bindings[0];+	if (expr->children[0]->expression_class != ExpressionClass::BOUND_CAST) {+		return nullptr;+	}+	auto cast_expression = (BoundCastExpression *)expr->children[0].get();+	if (cast_expression->child->expression_class != ExpressionClass::BOUND_COLUMN_REF) {+		return nullptr;+	}+	//! Here we check if we can apply the expression on the constant side+	auto target_type = cast_expression->source_type();+	if (!BoundCastExpression::CastIsInvertible(target_type, cast_expression->return_type)) {+		return nullptr;+	}+	for (size_t i{1}; i < expr->children.size(); i++) {

size_t i = 1 please

pdet

comment created time in 8 hours

issue commentcwida/duckdb

Auto Increment Primary Key And/or Serial

Certainly:

echo -e '42\n43\n44' > /tmp/dummy
COPY a(b) FROM '/tmp/dummy';
SELECT * FROM a;
┌───┬────┐
│ i │ b  │
├───┼────┤
│ 1 │ 42 │
│ 2 │ 43 │
│ 3 │ 44 │
└───┴────┘
willium

comment created time in 9 hours

issue commentcwida/duckdb

Auto Increment Primary Key And/or Serial

oh neat! is there any way to use this alongside read_csv/COPY?

willium

comment created time in 9 hours

issue commentcwida/duckdb

Return empty json array in case of no results returned

This is again the SQLite shell that does this, not DuckDB

burtgulash

comment created time in 10 hours

issue commentcwida/duckdb

Regression Analysis

Two options, 1) pull those columns into R, and run lm there. 2) Implement a recursive CTE that computes the fit.

waynelapierre

comment created time in 10 hours

issue commentcwida/duckdb

Auto Increment Primary Key And/or Serial

How about using a sequence? For example

CREATE SEQUENCE seq;
CREATE TABLE a (i INTEGER DEFAULT NEXTVAL('seq'), b INTEGER);
INSERT INTO a (b) VALUES (42), (43);
SELECT * FROM a;

Result:

┌───┬────┐
│ i │ b  │
├───┼────┤
│ 1 │ 42 │
│ 2 │ 43 │
└───┴────┘
willium

comment created time in 10 hours

issue openedcwida/duckdb

Auto Increment Primary Key And/or Serial

While Auto-Incrementing ideas are more useful, common, and idiomatic in an OLTP store, they can be very useful for tracking changesets (especially for caching) in OLAP analytical tasks. Towards that end, it would be great to have the ability to specify an AUTO INCREMENT policy on a column (or something more advanced like PostgreSQL's Serial flag). While it's easy enough to do this manually with a prior COUNT(*) query, a write-lock, and bulk insert statements, the only way to add such a column when using a scanner/reader like read_csv is to add a new column and manually UPDATE into that column (thereby ~defeating the purpose of those fast import mechanisms). Thoughts?

created time in 12 hours

issue commentcwida/duckdb

mavecentral java package failed in android

It looks like the version of Java might be too old? But even if you got past that, the binary inside the jar isn't compiled for ARM so it wouldn't work anyway

Grufy

comment created time in 15 hours

issue openedcwida/duckdb

mavecentral java package failed in android

I am trying to adopt DuckDB on Android via Maven Central; compilation is fine, but I hit an error at runtime.

Able to provide fix in mavecentral gradle plugin?

Module gradle dependencies { implementation 'org.duckdb:duckdb_jdbc:0.2.3' }

Example java url = "jdbc:duckdb:/sdcard/app/test.db"; (DuckDBConnection) DriverManager.getConnection(url);

Error java.lang.NoClassDefFoundError: Failed resolution of: [Ljava/nio/file/attribute/FileAttribute; at org.duckdb.DuckDBNative.<clinit>(DuckDBNative.java:32) at org.duckdb.DuckDBDatabase.<init>(DuckDBDatabase.java:22) at org.duckdb.DuckDBDriver.connect(DuckDBDriver.java:35)

created time in 15 hours

pull request commentcwida/duckdb

R package: Add ability to specify output timezone

It probably does make sense to normalize datetimes to UTC on their way in to DuckDB - I think this could be fairly straightforward to do when writing an existing R data.frame into DuckDB (i.e., using dbWriteTable), though I don't think it's currently possible when the csv is loaded directly to DuckDB (e.g., using duckdb_read_csv()), since DuckDB itself doesn't currently support that functionality. Is that right?

If that is correct, I see a potential risk in there being different behaviours for different ways of loading data into DuckDB from R. A user could initially load a csv into DuckDB using duckdb_read_csv() where the data is treated as UTC, and could then append to that table from a data.frame in R using dbWriteTable(), where the timestamps are normalized to UTC... thus ending up with a mismatch in timezones, potentially without the user realizing it...

ateucher

comment created time in 20 hours

issue commentcwida/duckdb

Regression Analysis

I mean the function with which you can regress a column on other columns. Like the lm function in R.

waynelapierre

comment created time in 21 hours

issue commentcwida/duckdb

Regression Analysis

Could you please be more specific? Which functions are you referring to?

waynelapierre

comment created time in 21 hours

issue openedcwida/duckdb

Regression Analysis

Does DuckDB provide regression analysis functions?

created time in a day

startedopenai/gym

started time in a day

issue openedcwida/duckdb

Return empty json array in case of no results returned

I'm using the -json output of the duckdb command as that allows me to send the query result straight to a client application without having to parse through it slowly in python.

The query returns an empty string when there are no results, which is not consistent with the case when the query result is non-empty, in which case an array of json objects is returned.

% duckdb -json /tmp/xxx.db 'select 1 where 1=1'                   
[{"1":1}]
% duckdb -json /tmp/xxx.db 'select 1 where 1=0'
%
% # ^ empty string returned

It's no big deal because I can just wrap it in an if statement, but I think it should be made consistent. Empty result should return [] instead of empty string.

Alternatively, it would be nice if there was an option to receive jsonlines output directly, i.e. results not wrapped in an array. Also it would be nice if I could receive json data directly using the python client.

created time in a day

issue commentcwida/duckdb

Slow read compared to SQLite

Any more details here, @ucicelos?

ucicelos

comment created time in a day

pull request commentcwida/duckdb

R package: Add ability to specify output timezone

Thanks for the PR, I wonder whether it might make sense to normalise dates on the way into DuckDB as well, so they are correctly stored as UTC there and you can then indeed convert them back into another TZ on querying.

ateucher

comment created time in a day

startedMarcStan/lets-encrypt-azure

started time in a day

pull request commentcwida/duckdb

STRING_SPLIT and STRING_SPLIT_REGEX SQL functions

Sorry, I turned off github notifications and forgot to turn them back on. No reason really. For consistency I suppose renaming to _regexp would be better

lnkuiper

comment created time in a day

push eventcwida/duckdb

Laurens Kuiper

commit sha ef7e01e52d39c00e73d1f658a32df8ab9453af6e

init deliminator

view details

Laurens Kuiper

commit sha 9173dace7138fe967ea03c508966cbd5cb4a5e40

debugging

view details

Laurens Kuiper

commit sha 6327339cefb2e121ffa0efe1e884166b49549692

deliminator working for most test cases now

view details

Laurens Kuiper

commit sha baad6738541f593c3c8b6e625cbaa2e758f741e0

deliminator filter NULL values

view details

Laurens Kuiper

commit sha bedd30c43bb59a9b38fac0cc04ca0c85fad8d4ed

Merge branch 'master' into deliminator

view details

Laurens Kuiper

commit sha 4336914db419969f595e9ff10e3ebfc7f5aad5a6

*WIP* keep track of 'projection aliases' of columns that are no longer duplicate eliminated

view details

Laurens Kuiper

commit sha a0eed38cb8c4c21265b87255ada0811e75fc05e9

refactor

view details

Laurens Kuiper

commit sha 0021303f6207c3a2168911bff93106081fa025c7

Merge branch 'master' into deliminator

view details

Laurens Kuiper

commit sha 3d8bf0dc56cc2b44e5bee7e079374f9c87b95f67

formatting

view details

Laurens Kuiper

commit sha 34eed70dc534fc80f276d46fbaffecedf2b4b3db

refactor deliminator done, TODO: fix all test cases

view details

Laurens Kuiper

commit sha ef389c7f417396722ad64d58aa12c34975745afb

fix some more tests (deliminator)

view details

Laurens Kuiper

commit sha 2989a8fe3dda5d2d2e7aab323f47df14deed335e

Merge branch 'master' into deliminator

view details

Laurens Kuiper

commit sha 657cff2e5c34b121384499eeb4118b2bbd5b024f

deal with correlated subqueries with aggregates/filters

view details

Laurens Kuiper

commit sha 6dc9a8b6c0c6ef97e7395d3300affa653a335b94

deliminator now deals with aggregation subqueries

view details

Laurens Kuiper

commit sha a5559067274abeb0214a7ae3e4d89ee47dea8905

keep track of delim_types in comparison joins to fix deliminator tests

view details

Laurens Kuiper

commit sha 89074e69c4e15b3ad3957f0555b13e6d848017f7

update comments

view details

Laurens Kuiper

commit sha 65a89850ea2cde1fc632a9fa92427c6682b9ce40

add deliminator test

view details

Laurens Kuiper

commit sha 030abc32fad4f26261f0a0ba59f6f2db24f2bf26

Merge branch 'master' into deliminator

view details

Laurens Kuiper

commit sha eafb53012f0a3c2b775924555a66dd2068e44abb

formatting

view details

Laurens Kuiper

commit sha 68bdc398b9bf35baaeb7ca30cf1f454fa7a7fbb7

refactor to use LogicalOperatorVisitor

view details

push time in a day

PR merged cwida/duckdb

Optimizer that removes redundant DELIM_GET and DELIM_JOIN operators

This PR implements an optimizer which I have affectionately called the Deliminator, an optimizer that removes redundant operators related to correlated subqueries.

When a query containing a correlated subquery is issued, a dependent join is created. Consider the following query:

EXPLAIN SELECT i=ANY(SELECT i FROM integers WHERE i=i1.i) FROM integers i1 ORDER BY i;

DuckDB has implemented a way of generically flattening these queries, following Unnesting Arbitrary Queries by Thomas Neumann and Alfons Kemper, to avoid quadratic complexity. This is done by creating a duplicate-eliminated join, or delim join, and pushing it down the query plan to decorrelate it.

This process introduces delim scans, which are usually joined (or in a cross product) with another part of the query plan. Under specific circumstances, joining with a delim scan does not introduce new information, and the join can be removed. If all delim scans belonging to a delim join can be removed, the delim join can be transformed into a comparison join instead.

This is more efficient for two reasons:

  1. Joins with delim scans can be removed, so there is less work to be done
  2. Delim joins can be regular comparison joins instead, which are easier to parallelise (and are already parallelised) in DuckDB

For the example query this means that the original query plan

┌───────────────────────────┐                                                                                                                    
│         PROJECTION        │                                                                                                                    
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                                                    
│             #0            │                                                                                                                    
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                                                    
│             3             │                                                                                                                    
│          (0.00s)          │                                                                                                                    
└─────────────┬─────────────┘                                                                                                                                                 
┌─────────────┴─────────────┐                                                                                                                    
│          ORDER_BY         │                                                                                                                    
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                                                    
│           #1 ASC          │                                                                                                                    
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                                                    
│             3             │                                                                                                                    
│          (0.00s)          │                                                                                                                    
└─────────────┬─────────────┘                                                                                                                                                 
┌─────────────┴─────────────┐                                                                                                                    
│         PROJECTION        │                                                                                                                    
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                                                    
│          SUBQUERY         │                                                                                                                    
│             i             │                                                                                                                    
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                                                    
│             3             │                                                                                                                    
│          (0.00s)          │                                                                                                                    
└─────────────┬─────────────┘                                                                                                                                                 
┌─────────────┴─────────────┐                                                                                                                    
│         DELIM_JOIN        │                                                                                                                    
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                                                                                    
│            MARK           │                                                                                                                    
│            i=i            │                                                                                                                    
│            i=#0           ├──────────────┐──────────────────────────────────────────────────────────────────────────────────────┐              
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │              │                                                                                      │              
│             3             │              │                                                                                      │              
│          (0.00s)          │              │                                                                                      │              
└─────────────┬─────────────┘              │                                                                                      │                                           
┌─────────────┴─────────────┐┌─────────────┴─────────────┐                                                          ┌─────────────┴─────────────┐
│          SEQ_SCAN         ││         HASH_JOIN         │                                                          │       HASH_GROUP_BY       │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          integers         ││            MARK           │                                                          │             #0            │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││            i=i            │                                                          │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             i             ││            i=#0           ├──────────────┐                                           │             0             │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │              │                                           │          (0.00s)          │
│             3             ││             3             │              │                                           │                           │
│          (0.00s)          ││          (0.00s)          │              │                                           │                           │
└───────────────────────────┘└─────────────┬─────────────┘              │                                           └───────────────────────────┘                             
                             ┌─────────────┴─────────────┐┌─────────────┴─────────────┐                                                          
                             │         CHUNK_SCAN        ││         PROJECTION        │                                                          
                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
                             │             3             ││             i             │                                                          
                             │          (0.00s)          ││             #0            │                                                          
                             │                           ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
                             │                           ││             3             │                                                          
                             │                           ││          (0.00s)          │                                                          
                             └───────────────────────────┘└─────────────┬─────────────┘                                                                                       
                                                          ┌─────────────┴─────────────┐                                                          
                                                          │         HASH_JOIN         │                                                          
                                                          │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
                                                          │           INNER           │                                                          
                                                          │            i=i            ├──────────────┐                                           
                                                          │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │              │                                           
                                                          │             3             │              │                                           
                                                          │          (0.00s)          │              │                                           
                                                          └─────────────┬─────────────┘              │                                                                        
                                                          ┌─────────────┴─────────────┐┌─────────────┴─────────────┐                             
                                                          │         DELIM_SCAN        ││          SEQ_SCAN         │                             
                                                          │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
                                                          │             3             ││          integers         │                             
                                                          │          (0.00s)          ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
                                                          │                           ││             i             │                             
                                                          │                           ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
                                                          │                           ││             3             │                             
                                                          │                           ││          (0.00s)          │                             
                                                          └───────────────────────────┘└───────────────────────────┘                                                          

Can be simplified to:

┌───────────────────────────┐                             
│         PROJECTION        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│             #0            │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│             3             │                             
│          (0.00s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│          ORDER_BY         │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│           #1 ASC          │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│             3             │                             
│          (0.00s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         PROJECTION        │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│          SUBQUERY         │                             
│             i             │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│             3             │                             
│          (0.00s)          │                             
└─────────────┬─────────────┘                                                          
┌─────────────┴─────────────┐                             
│         HASH_JOIN         │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│            MARK           │                             
│            i=i            │                             
│            i=#0           ├──────────────┐              
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │              │              
│             3             │              │              
│          (0.00s)          │              │              
└─────────────┬─────────────┘              │                                           
┌─────────────┴─────────────┐┌─────────────┴─────────────┐
│          SEQ_SCAN         ││         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          integers         ││             i             │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││             i             │
│             i             ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││             3             │
│             3             ││          (0.00s)          │
│          (0.00s)          ││                           │
└───────────────────────────┘└─────────────┬─────────────┘                             
                             ┌─────────────┴─────────────┐
                             │          SEQ_SCAN         │
                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
                             │          integers         │
                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
                             │             i             │
                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
                             │             3             │
                             │          (0.00s)          │
                             └───────────────────────────┘                             

The original projection can also be removed in this specific case, but in other cases the projection may project +(i, 3), in which case it is still necessary.

This will improve the performance of the FTS extension, and some TPC-H queries.

I am happy to receive any feedback on this PR!

+519 -11

1 comment

12 changed files

lnkuiper

pr closed time in a day

pull request commentcwida/duckdb

Optimizer that removes redundant DELIM_GET and DELIM_JOIN operators

Thanks, looks great now!

lnkuiper

comment created time in a day

Pull request review commentcwida/duckdb

Optimizer that removes redundant DELIM_GET and DELIM_JOIN operators

+#include "duckdb/optimizer/deliminator.hpp"++#include "duckdb/planner/expression/bound_columnref_expression.hpp"+#include "duckdb/planner/expression/bound_operator_expression.hpp"+#include "duckdb/planner/operator/logical_aggregate.hpp"+#include "duckdb/planner/operator/logical_delim_join.hpp"+#include "duckdb/planner/operator/logical_delim_get.hpp"+#include "duckdb/planner/operator/logical_filter.hpp"++namespace duckdb {++class DeliminatorPlanUpdater : LogicalOperatorVisitor {+public:+	DeliminatorPlanUpdater() {+	}+	//! Update the plan after a DelimGet has been removed+	void VisitOperator(LogicalOperator &op) override;+	void VisitExpression(unique_ptr<Expression> *expression) override;+	//! Whether the operator has one or more children of type DELIM_GET+	bool HasChildDelimGet(LogicalOperator &op);++	expression_map_t<Expression *> expr_map;+	column_binding_map_t<bool> projection_map;+	unique_ptr<LogicalOperator> temp_ptr;+};++void DeliminatorPlanUpdater::VisitOperator(LogicalOperator &op) {+	VisitOperatorChildren(op);+	VisitOperatorExpressions(op);+	// now check if this is a delim join that can be removed+	if (op.type == LogicalOperatorType::LOGICAL_DELIM_JOIN && !HasChildDelimGet(op)) {+		auto &delim_join = (LogicalDelimJoin &)op;+		auto decs = &delim_join.duplicate_eliminated_columns;+		for (auto &cond : delim_join.conditions) {+			if (cond.comparison != ExpressionType::COMPARE_EQUAL) {+				continue;+			}+			auto &colref = (BoundColumnRefExpression &)*cond.right;+			if (projection_map.find(colref.binding) != projection_map.end()) {+				// value on the right is a projection of removed DelimGet+				for (idx_t i = 0; i < decs->size(); i++) {+					if (decs->at(i)->Equals(cond.left.get())) {+						// the value on the left no longer needs to be a duplicate-eliminated column+						decs->erase(decs->begin() + i);+						break;+					}+				}+				// whether we applied an IS NOT NULL filter+				cond.null_values_are_equal = true; // projection_map[colref.binding];+			}+		}+		
// change type if there are no more duplicate-eliminated columns+		if (decs->empty()) {+			delim_join.type = LogicalOperatorType::LOGICAL_COMPARISON_JOIN;+		}+	}+}++void DeliminatorPlanUpdater::VisitExpression(unique_ptr<Expression> *expression) {+	if (expr_map.find(expression->get()) != expr_map.end()) {+		*expression = expr_map[expression->get()]->Copy();+	} else {+		VisitExpressionChildren(**expression);+	}+}++bool DeliminatorPlanUpdater::HasChildDelimGet(LogicalOperator &op) {+	if (op.type == LogicalOperatorType::LOGICAL_DELIM_GET) {+		return true;+	}+	for (auto &child : op.children) {+		if (HasChildDelimGet(*child)) {+			return true;+		}+	}+	return false;+}++unique_ptr<LogicalOperator> Deliminator::Optimize(unique_ptr<LogicalOperator> op) {+	vector<unique_ptr<LogicalOperator> *> candidates;+	FindCandidates(&op, candidates);++	for (auto candidate : candidates) {+		DeliminatorPlanUpdater updater;+		if (RemoveCandidate(candidate, updater)) {+			updater.VisitOperator(*op);+		}+	}+	return op;+}++void Deliminator::FindCandidates(unique_ptr<LogicalOperator> *op_ptr,+                                 vector<unique_ptr<LogicalOperator> *> &candidates) {+	auto op = op_ptr->get();+	// search children before adding, so the deepest candidates get added first+	for (auto &child : op->children) {+		FindCandidates(&child, candidates);+	}+	if ( // Projection/Aggregate

Agreed. I split it up and it is much more readable now

lnkuiper

comment created time in a day

startedankoh/duckdb.js

started time in 2 days

more