profile
viewpoint
Sreeharsha Ramanavarapu sreeharshar84 Mostly worked at Database companies: https://www.linkedin.com/in/sree-harsha-ramanavarapu-45b8617/

sreeharshar84/duckdb 1

DuckDB is an embeddable SQL OLAP Database Management System

sreeharshar84/dlt-daemon 0

Diagnostic Log and Trace.

fork Mytherin/libxbr-standalone

Library implementing the xBR pixel art scaling algorithm

fork in 14 days

started Treeki/libxbr-standalone

started time in 14 days

MemberEvent

pull request comment cwida/duckdb

Add support for parallel scanning of pandas data frames

Now with projection pushdown:

┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││         Total Time: 4.14s         ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
┌───────────────────────────┐
│          EXECUTE          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             4             │
│          (0.00s)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│          ORDER_BY         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│           #0 ASC          │
│           #1 ASC          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             4             │
│          (0.00s)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│       HASH_GROUP_BY       │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             #0            │
│             #1            │
│          sum(#2)          │
│          sum(#3)          │
│          sum(#4)          │
│          sum(#5)          │
│          avg(#6)          │
│          avg(#7)          │
│          avg(#8)          │
│        count_star()       │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             4             │
│          (1.66s)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│        l_returnflag       │
│        l_linestatus       │
│         l_quantity        │
│      l_extendedprice      │
│             #4            │
│ *(#4, +(1.000000, l_tax)) │
│         l_quantity        │
│      l_extendedprice      │
│         l_discount        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.45s)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│        l_returnflag       │
│        l_linestatus       │
│         l_quantity        │
│      l_extendedprice      │
│   *(l_extendedprice, -(1  │
│   .000000, l_discount))   │
│           l_tax           │
│         l_discount        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.47s)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             #0            │
│             #1            │
│             #2            │
│             #3            │
│             #4            │
│             #5            │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.03s)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│           FILTER          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│ l_shipdate<=1998-09-02 00 │
│           :00:00          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.10s)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│        PANDAS_SCAN        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│         l_quantity        │
│      l_extendedprice      │
│         l_discount        │
│           l_tax           │
│        l_returnflag       │
│        l_linestatus       │
│         l_shipdate        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59986052         │
│          (1.40s)          │
└───────────────────────────┘ 
Mytherin

comment created time in a month

pull request comment cwida/duckdb

Add support for parallel scanning of pandas data frames

Some profiling output for those interested:

┌─────────────────────────────────────┐
│┌───────────────────────────────────┐│
││         Total Time: 5.92s         ││
│└───────────────────────────────────┘│
└─────────────────────────────────────┘
┌───────────────────────────┐
│          EXECUTE          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             4             │
│          (0.00s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│          ORDER_BY         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│           #0 ASC          │
│           #1 ASC          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             4             │
│          (0.00s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│       HASH_GROUP_BY       │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             #0            │
│             #1            │
│          sum(#2)          │
│          sum(#3)          │
│          sum(#4)          │
│          sum(#5)          │
│          avg(#6)          │
│          avg(#7)          │
│          avg(#8)          │
│        count_star()       │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             4             │
│          (1.60s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│        l_returnflag       │
│        l_linestatus       │
│         l_quantity        │
│      l_extendedprice      │
│             #4            │
│ *(#4, +(1.000000, l_tax)) │
│         l_quantity        │
│      l_extendedprice      │
│         l_discount        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.40s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│        l_returnflag       │
│        l_linestatus       │
│         l_quantity        │
│      l_extendedprice      │
│   *(l_extendedprice, -(1  │
│   .000000, l_discount))   │
│           l_tax           │
│         l_discount        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.40s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             #0            │
│             #1            │
│             #2            │
│             #3            │
│             #4            │
│             #5            │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.04s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│           FILTER          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│ l_shipdate<=1998-09-02 00 │
│           :00:00          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59142609         │
│          (0.11s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             #4            │
│             #5            │
│             #6            │
│             #7            │
│             #8            │
│             #9            │
│            #10            │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59986052         │
│          (0.03s)          │
└─────────────┬─────────────┘
┌─────────────┴─────────────┐
│        PANDAS_SCAN        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│         l_quantity        │
│      l_extendedprice      │
│         l_discount        │
│           l_tax           │
│        l_returnflag       │
│        l_linestatus       │
│         l_shipdate        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          59986052         │
│          (3.29s)          │
└───────────────────────────┘

It seems that Pandas string conversions are still (relatively) expensive, but not crazily so. I also just realized the pandas scan doesn't do projection pushdown yet, will add that next.

Mytherin

comment created time in a month

pull request comment cwida/duckdb

Add support for parallel scanning of pandas data frames

Also CC @tdoehmen

Mytherin

comment created time in a month

pull request comment cwida/duckdb

Add support for parallel scanning of pandas data frames

Simple benchmark running TPC-H Q1 on SF1 from a pandas dataframe:

threads Timing (s)
Single-Threaded 0.60
8 Threads 0.11

Script:

import duckdb
import time

con = duckdb.connect()
df = con.execute("select * from read_csv_auto('lineitem-colname.csv')").fetchdf()
con.register('lineitem_df', df)

start = time.time()
print(con.execute('''SELECT
     l_returnflag,
     l_linestatus,
     sum(l_quantity) AS sum_qty,
     sum(l_extendedprice) AS sum_base_price,
     sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
     sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
     avg(l_quantity) AS avg_qty,
     avg(l_extendedprice) AS avg_price,
     avg(l_discount) AS avg_disc,
     count(*) AS count_order
 FROM
     lineitem_df
 WHERE
     l_shipdate <= CAST('1998-09-02' AS date)
 GROUP BY
     l_returnflag,
     l_linestatus
 ORDER BY
     l_returnflag,
     l_linestatus;
 ''').fetchdf())
end = time.time()
print('Single threaded ', end - start)

con.execute('pragma threads=8')

start = time.time()
print(con.execute('''SELECT
     l_returnflag,
     l_linestatus,
     sum(l_quantity) AS sum_qty,
     sum(l_extendedprice) AS sum_base_price,
     sum(l_extendedprice * (1 - l_discount)) AS sum_disc_price,
     sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge,
     avg(l_quantity) AS avg_qty,
     avg(l_extendedprice) AS avg_price,
     avg(l_discount) AS avg_disc,
     count(*) AS count_order
 FROM
     lineitem_df
 WHERE
     l_shipdate <= CAST('1998-09-02' AS date)
 GROUP BY
     l_returnflag,
     l_linestatus
 ORDER BY
     l_returnflag,
     l_linestatus;
 ''').fetchdf())
end = time.time()
print('8 Threads ', end - start)

Data generation (needs TPC-H extension):

CALL dbgen(sf=1);
COPY lineitem TO 'lineitem-colname.csv' (HEADER);
Mytherin

comment created time in a month

pull request comment cwida/duckdb

Add support for parallel scanning of pandas data frames

@Alex-Monahan @tdoehmen

Mytherin

comment created time in a month

PR opened cwida/duckdb

Add support for parallel scanning of pandas data frames

This PR adds support for parallel scans of pandas dataframes, so queries running with pandas dataframes as a source can also benefit from intra-pipeline parallelism.

+81 -9

0 comment

2 changed files

pr created time in a month

issue comment cwida/duckdb

Crash on single-argument COALESCE

Works fine now!

On Tue, Dec 15, 2020 at 5:12 PM Hannes Mühleisen notifications@github.com wrote:

This works fine for me, I suspect you still have an old version. What's your output of dbGetQuery(con, "pragma version")? I get

library_version source_id 1 0.2.4-dev103 5eff21830

— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/cwida/duckdb/issues/1222#issuecomment-745156740, or unsubscribe https://github.com/notifications/unsubscribe-auth/AB7PRZBKRL23MMVFK5QQOULSU4SAJANCNFSM4UZJXYOA .

alanpaulkwan

comment created time in a month

issue comment cwida/duckdb

Crash on single-argument COALESCE

This works fine for me, I suspect you still have an old version. What's your output of dbGetQuery(con, "pragma version")? I get

  library_version source_id
1    0.2.4-dev103 5eff21830
alanpaulkwan

comment created time in a month

issue comment cwida/duckdb

Crash on single-argument COALESCE

Sorry, what I mean was: I just reinstalled the latest version via the install instructions on your DuckDB page.

image

alanpaulkwan

comment created time in a month

issue comment cwida/duckdb

Nodejs API - Support for outputting data as binary columns.

@jpkli

I actually want the whole column to be returned as a TypedArray based on the DuckDB field type.

Would you provide more info on your use case? I'm curious what you're trying to do

@willium

I'm curious what the difference in objective is between node-duckdb and the one bundled in this repo.

In terms of objective in the grand scheme of things there isn't a difference. We developed node-duckdb for our use case at DeepCrawl and released it in prod 3 days after the DuckDB team announced the release of their bindings. In terms of smaller differences, it's written in TS which is important to us as we're a TS company.

It seems like this early in the development, it'd be great to rally around one client.

We're definitely open for collaboration 👍

TypeScript is great, we also use it. But it would be nice to adapt the first-party tool to typescript and better typings rather than have a parallel tool :)

I took a look at your implementation of fetchRow--it looks great!--can I ask why you don't parse all the rows in the current chunk rather than just one at a time?

jpkli

comment created time in a month

issue comment cwida/duckdb

Windows: duckdb.dll missing export functions.

I will do that

chilarai

comment created time in a month

pull request comment cwida/duckdb

Fix for parallel pandas scans (fixes #1220)

So in DuckDB there are two types of parallelism within a single query: inter-pipeline and intra-pipeline parallelism. Pipelines are segments of a query tree that can be run independently. For example, if we look at the query that you supplied in the issue:

explain select *
from main_table
left join left_join_table t1
	on main_table.join_column = t1.join_column
left join left_join_table t2
	on main_table.join_column = t2.join_column;
┌─────────────────────────────┐
│┌───────────────────────────┐│
││       Physical Plan       ││
│└───────────────────────────┘│
└─────────────────────────────┘
┌───────────────────────────┐                                                          
│         HASH_JOIN         │                                                          
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                                                          
│            LEFT           ├───────────────────────────────────────────┐              
│  join_column=join_column  │                                           │              
└─────────────┬─────────────┘                                           │                                           
┌─────────────┴─────────────┐                             ┌─────────────┴─────────────┐
│         HASH_JOIN         │                             │          SEQ_SCAN         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│            LEFT           │                             │      left_join_table      │
│  join_column=join_column  ├──────────────┐              │   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│                           │              │              │        join_column        │
│                           │              │              │        other_column       │
└─────────────┬─────────────┘              │              └───────────────────────────┘                             
┌─────────────┴─────────────┐┌─────────────┴─────────────┐                             
│          SEQ_SCAN         ││          SEQ_SCAN         │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│         main_table        ││      left_join_table      │                             
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   ││   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │                             
│        join_column        ││        join_column        │                             
│                           ││        other_column       │                             
└───────────────────────────┘└───────────────────────────┘                                                          

There are three pipelines. First, the two hash table builds for the join (hash tables are always built on the right side). In this case we have SEQ_SCAN(left_join_table) -> HASH_JOIN (build) and SEQ_SCAN(left_join_table) -> HASH_JOIN (build) (the bottom and the top one). Then we have the final pipeline, that goes from SEQ_SCAN(main_table) -> HASH_JOIN (probe) -> HASH_JOIN (probe) -> Result. The final pipeline is dependent on the first two pipelines (the two hash table builds). However, the two hash table builds are separate, and can thus be executed independently. This is what triggered the bug: the left_join_table was scanned in parallel in the two separate pipelines. This is what we call inter-pipeline parallelism. This can happen regardless of which scans you are using as long as the pipelines are independent, but this depends on the complexity of the query. Simple queries do not have many pipelines (e.g. SELECT SUM(i) FROM table has only one pipeline).

Intra-pipeline parallelism is when the data source (i.e. the scan over the pandas dataframe, in this case) is partitioned and the pipeline itself is run in parallel. This requires the scan operator to "know" about the partitioning, i.e. we need to turn the scan into a parallel scan. This is not very complicated, but just hasn't happened yet for Pandas DataFrames. Currently this is only enabled for base table scans and for parquet scans.

Here are some slides if you want to read more:

duckdb-parallelism.pdf

Mytherin

comment created time in a month

pull request comment cwida/duckdb

Fix for parallel pandas scans (fixes #1220)

Just a small question for my own understanding: If we have two (or more) different dataframes that we have registered and are querying both in the same SQL statement, will those scans be in parallel? (Ex: scanning DF1 and DF2 at the same time)

Mytherin

comment created time in a month

issue comment cwida/duckdb

Threading issue: Multiple left joins to same table and where clause kills Python process

I can confirm that this fixes my issue!

Alex-Monahan

comment created time in a month

pull request comment cwida/duckdb

Fix for parallel pandas scans (fixes #1220)

Note this is only for Pandas -> DuckDB, not for DuckDB -> Pandas. More performance improvements need to happen there. We also still need to enable parallel scans on Pandas DataFrames (as in, partitioning the DataFrame into separate chunks, the parallel scans mentioned here are only for when the same DataFrame is used twice in one query), and parallel Pandas DataFrame construction.

Mytherin

comment created time in a month

pull request comment cwida/duckdb

Fix for parallel pandas scans (fixes #1220)

Great! Looking forward to try it out. I'll report some numbers soon

Mytherin

comment created time in a month

push event cwida/duckdb

Mark Raasveldt

commit sha 4983b0089b5add429d1f59834546aa4e8d329466

Fix parallel pandas scans by ensuring no operations are used that require the GIL (i.e. no objects are constructed). This should fix #1220.

view details

Mark Raasveldt

commit sha f78f2264473a655b8fb1260b0d9fb5242f3cd8c4

Some Python 2 cleanup, old pandas versions may not have to_numpy attr

view details

Mark Raasveldt

commit sha 79377c23b99f2e69cdc1297ecdb30ac9cdc51a50

Just upgrade the pandas version on jewels

view details

Mark Raasveldt

commit sha 86944e47525e42891781e4d93bdacf446fd51b6f

u"" for python2

view details

Mark Raasveldt

commit sha 91465c937ef60558a03f1c6a03b341f157177745

These should use u"" as well

view details

Mark Raasveldt

commit sha 6029121b378ba2aa912f8192e6885fe5ed548f7d

Remove unicode support for Python 2

view details

Mark Raasveldt

commit sha 2bc0dfc87a3ff58ca41bb8676f4f5c29f06c6a88

Also skip this test on python 2

view details

Mark

commit sha 5eff2183077c4ce6fd77e033495b9fe4ca815766

Merge pull request #1226 from Mytherin/pandasscanfix Fix for parallel pandas scans (fixes #1220)

view details

push time in a month

issue closed cwida/duckdb

Threading issue: Multiple left joins to same table and where clause kills Python process

Hey Folks,

I'm seeing some failures when attempting to use multithreading. The failure mode is that the Python process completely dies.

I've isolated at least one basic query that is not working for some reason (It's not 100% of the time that it fails, but close to it. Maybe some kind of race condition?). It runs correctly if threads are not enabled (when pragma threads line of code is comment out). It also runs if the where clause is not present. It also runs if the left join table is only used once.

I'm using the Python client (Python 3.7.7 on Windows, DuckDB 0.2.3).

Thanks for your help with this! We were upgrading from 0.1.9 to 0.2.2 to get multithreading and some of the other syntax goodness you've added lately and had to revert the multithreading piece.

Thanks, Alex

#This is broken:
import duckdb
import pandas as pd

main_table = pd.DataFrame([{"join_column":"text"}])
left_join_table = pd.DataFrame([{"join_column":"text","other_column":"more text"}])

sql = """
select
    main_table.*
    ,t1.*
    ,t2.*
from main_table 
left join left_join_table t1
    on main_table.join_column = t1.join_column
left join left_join_table t2
    on main_table.join_column = t2.join_column
where
    t1.other_column = 'more text'
"""

try:
    duckdb_conn = duckdb.connect()
    duckdb_conn.execute("PRAGMA threads=4")
    duckdb_conn.register('main_table',main_table)
    duckdb_conn.register('left_join_table',left_join_table)
    output_df = duckdb_conn.execute(sql).fetchdf()
except Exception as err:
    print(err)
finally:
    duckdb_conn.close()
    
output_df
#This works:
import duckdb
import pandas as pd

main_table = pd.DataFrame([{"join_column":"text"}])
left_join_table = pd.DataFrame([{"join_column":"text","other_column":"more text"}])

sql = """
select
    main_table.*
    ,t1.*
    ,t2.*
from main_table 
left join left_join_table t1
    on main_table.join_column = t1.join_column
left join left_join_table t2
    on main_table.join_column = t2.join_column
"""

try:
    duckdb_conn = duckdb.connect()
    duckdb_conn.execute("PRAGMA threads=4")
    duckdb_conn.register('main_table',main_table)
    duckdb_conn.register('left_join_table',left_join_table)
    output_df = duckdb_conn.execute(sql).fetchdf()
except Exception as err:
    print(err)
finally:
    duckdb_conn.close()
    
output_df
#This works:
# Same query as the failing case INCLUDING the WHERE clause, but without
# "PRAGMA threads=4" — succeeds single-threaded, isolating multi-threading
# as the other part of the trigger.
import duckdb
import pandas as pd

main_table = pd.DataFrame([{"join_column":"text"}])
left_join_table = pd.DataFrame([{"join_column":"text","other_column":"more text"}])

sql = """
select
    main_table.*
    ,t1.*
    ,t2.*
from main_table 
left join left_join_table t1
    on main_table.join_column = t1.join_column
left join left_join_table t2
    on main_table.join_column = t2.join_column
where
    t1.other_column = 'more text'
"""

# Pre-bind both names so a failure in connect() or the query cannot cause a
# secondary NameError in the finally block or at the trailing expression.
duckdb_conn = None
output_df = None
try:
    duckdb_conn = duckdb.connect()

    duckdb_conn.register('main_table',main_table)
    duckdb_conn.register('left_join_table',left_join_table)
    output_df = duckdb_conn.execute(sql).fetchdf()
except Exception as err:
    print(err)
finally:
    # Close only if the connection was actually established.
    if duckdb_conn is not None:
        duckdb_conn.close()
    
output_df

closed time in a month

Alex-Monahan

PR merged cwida/duckdb

Fix for parallel pandas scans (fixes #1220)

Pandas scans can happen in parallel in threads that do not hold the GIL when background threads are enabled (i.e. PRAGMA threads=X where X > 1). This PR modifies the pandas scans to make sure that we are not constructing Python objects or doing anything that can invoke race conditions (e.g. adding new references to objects). This PR should also significantly speed up pandas scans of strings, "nullable" integers (e.g. Int8, Int16) and timestamps with timezones, as we are now directly scanning the underlying NumPy arrays rather than doing Python object construction.

+261 -141

4 comments

8 changed files

Mytherin

pr closed time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+//===----------------------------------------------------------------------===//+//                         DuckDB+//+// duckdb/optimizer/filter_pullup.hpp+//+//+//===----------------------------------------------------------------------===//++#pragma once++#include "duckdb/common/unordered_set.hpp"+#include "duckdb/planner/logical_operator.hpp"+#include "duckdb/planner/operator/logical_filter.hpp"+#include <memory>+#include <vector>++namespace duckdb {++class Optimizer;++class FilterPullup {+public:+    FilterPullup(Optimizer &optimizer) : optimizer(optimizer) {+    }++    FilterPullup(Optimizer &optimizer, unique_ptr<LogicalOperator>::pointer root_proj, bool fork=false) : +                 optimizer(optimizer),  root_pullup_node_ptr(root_proj), fork(fork) {+    }++    //! Perform filter pullup+    unique_ptr<LogicalOperator> Rewrite(unique_ptr<LogicalOperator> node);++private:+    vector<unique_ptr<Expression>> filters_expr_pullup;

We prefer not to mix variables and functions in the same private/public block, but rather do e.g.:

private:
    void Function1();
    void Function2();

private:
    int var1;
    int var2;
tiagokepe

comment created time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+#include "duckdb/optimizer/filter_pullup.hpp"+#include "duckdb/planner/expression/bound_columnref_expression.hpp"+#include "duckdb/planner/expression_iterator.hpp"+#include "duckdb/planner/operator/logical_empty_result.hpp"+#include "duckdb/planner/operator/logical_projection.hpp"+#include "duckdb/planner/expression/bound_comparison_expression.hpp"++namespace duckdb {+using namespace std;++static Expression *GetColumnRefExpression(Expression &expr) {+    if (expr.type == ExpressionType::BOUND_COLUMN_REF) {+        return &expr;+    }+    ExpressionIterator::EnumerateChildren(expr, [&](Expression &child) { return GetColumnRefExpression(child); });+    return &expr;+}++static bool GenerateBinding(LogicalProjection &proj, BoundColumnRefExpression &colref, ColumnBinding &binding) {+    D_ASSERT(colref.depth == 0);+    int column_index = -1;+    // find the corresponding column index in the projection+    for(idx_t proj_idx=0; proj_idx < proj.expressions.size(); proj_idx++) {+        auto proj_colref = GetColumnRefExpression(*proj.expressions[proj_idx]);

It seems that this can be replaced with auto proj_colref = proj.expressions[proj_idx].get() and the function GetColumnRefExpression can be removed.

tiagokepe

comment created time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+#include "duckdb/optimizer/filter_pullup.hpp"+#include "duckdb/planner/operator/logical_join.hpp"++namespace duckdb {+using namespace std;++unique_ptr<LogicalOperator> FilterPullup::Rewrite(unique_ptr<LogicalOperator> op) {+    switch (op->type) {+        case LogicalOperatorType::LOGICAL_FILTER:+            return PullupFilter(move(op));+        case LogicalOperatorType::LOGICAL_PROJECTION:+            return PullupProjection(move(op));+        case LogicalOperatorType::LOGICAL_CROSS_PRODUCT:+            return PullupCrossProduct(move(op));+        case LogicalOperatorType::LOGICAL_COMPARISON_JOIN:+        case LogicalOperatorType::LOGICAL_ANY_JOIN:+        case LogicalOperatorType::LOGICAL_DELIM_JOIN:+            return PullupJoin(move(op));+        case LogicalOperatorType::LOGICAL_INTERSECT:

Could you also handle LOGICAL_DISTINCT and LOGICAL_ORDER_BY? These are rather simple, as you can just pull up filters directly through them (whether or not the filter is before or after the operator does not matter - the operators are unaffected).

tiagokepe

comment created time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+#include "duckdb/optimizer/filter_pullup.hpp"+#include "duckdb/planner/expression/bound_columnref_expression.hpp"+#include "duckdb/planner/expression_iterator.hpp"+#include "duckdb/planner/operator/logical_empty_result.hpp"+#include "duckdb/planner/operator/logical_projection.hpp"+#include "duckdb/planner/expression/bound_comparison_expression.hpp"++namespace duckdb {+using namespace std;++static Expression *GetColumnRefExpression(Expression &expr) {+    if (expr.type == ExpressionType::BOUND_COLUMN_REF) {+        return &expr;+    }+    ExpressionIterator::EnumerateChildren(expr, [&](Expression &child) { return GetColumnRefExpression(child); });+    return &expr;+}++static bool GenerateBinding(LogicalProjection &proj, BoundColumnRefExpression &colref, ColumnBinding &binding) {+    D_ASSERT(colref.depth == 0);+    int column_index = -1;+    // find the corresponding column index in the projection+    for(idx_t proj_idx=0; proj_idx < proj.expressions.size(); proj_idx++) {+        auto proj_colref = GetColumnRefExpression(*proj.expressions[proj_idx]);+        if (proj_colref->type == ExpressionType::BOUND_COLUMN_REF) {+            // auto proj_colref = (BoundColumnRefExpression *)proj.expressions[proj_idx].get();+            if(colref.Equals(proj_colref)) {+                column_index = proj_idx;+                break;+            }+        }+    }+    // Case the filter column is not projected, returns false+    if(column_index == -1) {+        return false;+    }+    binding.table_index = proj.table_index;+    binding.column_index = column_index;+    return true;+}++static bool ReplaceFilterBindings(LogicalProjection &proj, Expression &expr) {+    // we do not use ExpressionIterator here because we need to check if the filtered column is being projected,+    // otherwise we should avoid the filter to be pulled up by returning false+    if(expr.expression_class == ExpressionClass::BOUND_COMPARISON) {+        auto &comp_expr = (BoundComparisonExpression &)expr;+    
    unique_ptr<BoundColumnRefExpression> left_expr, right_expr;+        if(comp_expr.left->type == ExpressionType::BOUND_COLUMN_REF) {+            auto &colref = (BoundColumnRefExpression &)*comp_expr.left;+            ColumnBinding binding;+            if(GenerateBinding(proj, colref, binding) == false) {+                // the filtered column is not projected, this filter doesn't need to be pulled up+                return false;+            }+            left_expr = make_unique<BoundColumnRefExpression>(colref.alias, colref.return_type, binding, colref.depth);+        }+        if(comp_expr.right->type == ExpressionType::BOUND_COLUMN_REF) {+            auto &colref = (BoundColumnRefExpression &)*comp_expr.right;+            ColumnBinding binding;+            if(GenerateBinding(proj, colref, binding) == false) {+                // the filtered column is not projected, this filter doesn't need to be pulled up+                return false;+            }+            right_expr = make_unique<BoundColumnRefExpression>(colref.alias, colref.return_type, binding, colref.depth);+        }+        if(left_expr) {+            comp_expr.left = move(left_expr);+        }+        if(right_expr) {+            comp_expr.right = move(right_expr);+        }+    }+    return true;+}++static void RevertFilterPullup(LogicalProjection &proj, vector<unique_ptr<Expression>> &expressions) {+    unique_ptr<LogicalFilter> filter = make_unique<LogicalFilter>();+    for(idx_t i=0; i < expressions.size(); ++i) {+        filter->expressions.push_back(move(expressions[i]));+    }+    filter->children.push_back(move(proj.children[0]));+    proj.children[0] = move(filter);+}++unique_ptr<LogicalOperator> FilterPullup::PullupProjection(unique_ptr<LogicalOperator> op) {+    D_ASSERT(op->type == LogicalOperatorType::LOGICAL_PROJECTION);+    if(root_pullup_node_ptr == nullptr) {+        root_pullup_node_ptr = op.get();+    }+    op->children[0] = Rewrite(move(op->children[0]));+    
if(root_pullup_node_ptr == op.get() && filters_expr_pullup.size() > 0) {+        return GeneratePullupFilter(move(op), filters_expr_pullup);+    }+    if(filters_expr_pullup.size() > 0) {+        auto &proj = (LogicalProjection &)*op;+        vector<unique_ptr<Expression>> expressions_to_revert;+        for(idx_t i=0; i < filters_expr_pullup.size(); ++i) {+            auto &expr =  (Expression &)*filters_expr_pullup[i];+            if(!ReplaceFilterBindings(proj, expr)) {

Nice job on the projection pushdown, this is indeed quite complicated. Rather than reverting it, I would rather add any column bindings that you need to complete the filter. For example, suppose you have the following plan fragment:

┌───────────────────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          +(i, 1)          │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│           FILTER          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│            i=2            │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│          SEQ_SCAN         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          integers         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             i             │
└───────────────────────────┘

You can extend the projection by adding the column i, and turn it into the following plan:

┌─────────────┴─────────────┐
│           FILTER          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│            i=2            │
└─────────────┬─────────────┘            
┌───────────────────────────┐
│         PROJECTION        │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│        +(i, 1), i         │
└─────────────┬─────────────┘                 
┌─────────────┴─────────────┐
│          SEQ_SCAN         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│          integers         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             i             │
└───────────────────────────┘

Any columns that you add will be removed again by the filter pushdown optimizer and the unused column remover.

Note that you can only add columns if it is a non-root projection, since adding columns to the root projection will also mean adding columns to the result. You can check the RemoveUnusedColumns optimizer for how that is handled there (the RemoveUnusedColumns optimizer has a parameter on construction is_root that defaults to false, it is only set to true when it is first created on the root of the tree).

tiagokepe

comment created time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+#include "duckdb/optimizer/filter_pullup.hpp"+#include "duckdb/planner/operator/logical_filter.hpp"+#include "duckdb/planner/expression/bound_comparison_expression.hpp"+#include "duckdb/planner/expression_iterator.hpp"+#include "duckdb/planner/expression/bound_between_expression.hpp"++namespace duckdb {+using namespace std;++static bool IsComparisonFodable(unique_ptr<Expression> &expr) {+	if(expr->GetExpressionClass() == ExpressionClass::BOUND_COMPARISON) {+		auto &comparison = (BoundComparisonExpression &)*expr;+		// check if one of the sides is a scalar value+		return comparison.left->IsFoldable() || comparison.right->IsFoldable();+	}++	if(expr->GetExpressionClass() == ExpressionClass::BOUND_BETWEEN) {+		auto &comparison = (BoundBetweenExpression &)*expr;+		//! check if one of the sides is a scalar value+		return  comparison.lower->IsFoldable() || comparison.upper->IsFoldable();+	}+	//TODO it's missing to treat the case of duckdb::ExpressionType::CONJUNCTION_OR+	ExpressionIterator::EnumerateChildren(+	    *expr, [&](unique_ptr<Expression> &child) { return IsComparisonFodable(child); });+	return false;+}++static bool IsFilterFodable(vector<unique_ptr<Expression>> &expressions) {+	for(auto &expr: expressions) {+		if(!IsComparisonFodable(expr)) {+			return false;+		}+	}+	return true;+}++unique_ptr<LogicalOperator> FilterPullup::PullupFilter(unique_ptr<LogicalOperator> op) {+	D_ASSERT(op->type == LogicalOperatorType::LOGICAL_FILTER);++	if(fork && IsFilterFodable(op->expressions)) {

I don't quite follow the code here - why does a filter need to be foldable for us to pull up from it? If we have a bunch of filters that we are pulling up, and encounter another filter, we can just combine the filters no?

It does not make sense to handle foldable filters either way, no? Foldable filters are constant filters (e.g. WHERE 1=0). They can be removed entirely in the first place, since we know if they will return true or false for all tuples.

tiagokepe

comment created time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+//===----------------------------------------------------------------------===//+//                         DuckDB+//+// duckdb/optimizer/filter_pullup.hpp+//+//+//===----------------------------------------------------------------------===//++#pragma once++#include "duckdb/common/unordered_set.hpp"+#include "duckdb/planner/logical_operator.hpp"+#include "duckdb/planner/operator/logical_filter.hpp"+#include <memory>+#include <vector>++namespace duckdb {++class Optimizer;++class FilterPullup {+public:+    FilterPullup(Optimizer &optimizer) : optimizer(optimizer) {+    }++    FilterPullup(Optimizer &optimizer, unique_ptr<LogicalOperator>::pointer root_proj, bool fork=false) : +                 optimizer(optimizer),  root_pullup_node_ptr(root_proj), fork(fork) {+    }++    //! Perform filter pullup+    unique_ptr<LogicalOperator> Rewrite(unique_ptr<LogicalOperator> node);++private:+    vector<unique_ptr<Expression>> filters_expr_pullup;+    Optimizer &optimizer;+    // node resposible for pulling up filters+    unique_ptr<LogicalOperator>::pointer root_pullup_node_ptr = nullptr;

I think it would be cleaner to remove the root_pullup_node_ptr, but have every operator handle the pull up of expressions itself (as long as fork/can_pullup is true).

tiagokepe

comment created time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+#include "duckdb/optimizer/filter_pullup.hpp"+#include "duckdb/planner/operator/logical_filter.hpp"+#include "duckdb/planner/expression/bound_comparison_expression.hpp"+#include "duckdb/planner/expression_iterator.hpp"+#include "duckdb/planner/expression/bound_between_expression.hpp"++namespace duckdb {+using namespace std;++static bool IsComparisonFodable(unique_ptr<Expression> &expr) {

Can't you directly use expr->IsFoldable() here?

tiagokepe

comment created time in a month

Pull request review commentcwida/duckdb

Filter Pull Up

+//===----------------------------------------------------------------------===//+//                         DuckDB+//+// duckdb/optimizer/filter_pullup.hpp+//+//+//===----------------------------------------------------------------------===//++#pragma once++#include "duckdb/common/unordered_set.hpp"+#include "duckdb/planner/logical_operator.hpp"+#include "duckdb/planner/operator/logical_filter.hpp"+#include <memory>+#include <vector>++namespace duckdb {++class Optimizer;++class FilterPullup {+public:+    FilterPullup(Optimizer &optimizer) : optimizer(optimizer) {+    }++    FilterPullup(Optimizer &optimizer, unique_ptr<LogicalOperator>::pointer root_proj, bool fork=false) : +                 optimizer(optimizer),  root_pullup_node_ptr(root_proj), fork(fork) {+    }++    //! Perform filter pullup+    unique_ptr<LogicalOperator> Rewrite(unique_ptr<LogicalOperator> node);++private:+    vector<unique_ptr<Expression>> filters_expr_pullup;+    Optimizer &optimizer;+    // node resposible for pulling up filters+    unique_ptr<LogicalOperator>::pointer root_pullup_node_ptr = nullptr;+    // only pull up filters when there is a fork+    bool fork = false;

Could you rename this to can_pullup?

tiagokepe

comment created time in a month

more