This article compares the performance of ElasticSearch and ClickHouse, using the ontime data set of ClickHouse
Ontime Dataset
Data explain:
- ClickHouse Official dataset, website: ontime
- The ontime data set is the aviation delay data of the United States from 1987 to 2021
- Including 110 fields such as departure city, arrival city, flight number, departure delay, arrival delay, departure time, arrival time, etc.
- Contain multiple types such as integer, string, time, etc.
Hardware: 8c 32g + ESSD PL1 4000GB, 4 nodes
Version:
- Elasticsearch v7.10.0
- ClickHouse 20.8.7.15
Data count: 197,938,035
Origin raw data: 82 GB, csv format data
ClickHouse storge: 16.6 GB
ElasticSearch storge:
- 87.1 GB (default mapping)
- 19.4 GB (all fields have docvalues; after removing the source field and the inverted fields that are not required for query; 5 fields such as Year, Month, etc. have integer and keyword inverted and docvalue)
ClickHouse schema:
CREATE TABLE `ontime`
(
`Year` UInt16,
`Quarter` UInt8,
`Month` UInt8,
`DayofMonth` UInt8,
`DayOfWeek` UInt8,
`FlightDate` Date,
`Reporting_Airline` String,
`DOT_ID_Reporting_Airline` Int32,
`IATA_CODE_Reporting_Airline` String,
`Tail_Number` String,
`Flight_Number_Reporting_Airline` String,
`OriginAirportID` Int32,
`OriginAirportSeqID` Int32,
`OriginCityMarketID` Int32,
`Origin` FixedString(5),
`OriginCityName` String,
`OriginState` FixedString(2),
`OriginStateFips` String,
`OriginStateName` String,
`OriginWac` Int32,
`DestAirportID` Int32,
`DestAirportSeqID` Int32,
`DestCityMarketID` Int32,
`Dest` FixedString(5),
`DestCityName` String,
`DestState` FixedString(2),
`DestStateFips` String,
`DestStateName` String,
`DestWac` Int32,
`CRSDepTime` Int32,
`DepTime` Int32,
`DepDelay` Int32,
`DepDelayMinutes` Int32,
`DepDel15` Int32,
`DepartureDelayGroups` String,
`DepTimeBlk` String,
`TaxiOut` Int32,
`WheelsOff` Int32,
`WheelsOn` Int32,
`TaxiIn` Int32,
`CRSArrTime` Int32,
`ArrTime` Int32,
`ArrDelay` Int32,
`ArrDelayMinutes` Int32,
`ArrDel15` Int32,
`ArrivalDelayGroups` Int32,
`ArrTimeBlk` String,
`Cancelled` UInt8,
`CancellationCode` FixedString(1),
`Diverted` UInt8,
`CRSElapsedTime` Int32,
`ActualElapsedTime` Int32,
`AirTime` Nullable(Int32),
`Flights` Int32,
`Distance` Int32,
`DistanceGroup` UInt8,
`CarrierDelay` Int32,
`WeatherDelay` Int32,
`NASDelay` Int32,
`SecurityDelay` Int32,
`LateAircraftDelay` Int32,
`FirstDepTime` String,
`TotalAddGTime` String,
`LongestAddGTime` String,
`DivAirportLandings` String,
`DivReachedDest` String,
`DivActualElapsedTime` String,
`DivArrDelay` String,
`DivDistance` String,
`Div1Airport` String,
`Div1AirportID` Int32,
`Div1AirportSeqID` Int32,
`Div1WheelsOn` String,
`Div1TotalGTime` String,
`Div1LongestGTime` String,
`Div1WheelsOff` String,
`Div1TailNum` String,
`Div2Airport` String,
`Div2AirportID` Int32,
`Div2AirportSeqID` Int32,
`Div2WheelsOn` String,
`Div2TotalGTime` String,
`Div2LongestGTime` String,
`Div2WheelsOff` String,
`Div2TailNum` String,
`Div3Airport` String,
`Div3AirportID` Int32,
`Div3AirportSeqID` Int32,
`Div3WheelsOn` String,
`Div3TotalGTime` String,
`Div3LongestGTime` String,
`Div3WheelsOff` String,
`Div3TailNum` String,
`Div4Airport` String,
`Div4AirportID` Int32,
`Div4AirportSeqID` Int32,
`Div4WheelsOn` String,
`Div4TotalGTime` String,
`Div4LongestGTime` String,
`Div4WheelsOff` String,
`Div4TailNum` String,
`Div5Airport` String,
`Div5AirportID` Int32,
`Div5AirportSeqID` Int32,
`Div5WheelsOn` String,
`Div5TotalGTime` String,
`Div5LongestGTime` String,
`Div5WheelsOff` String,
`Div5TailNum` String
) ENGINE = MergeTree
PARTITION BY Year
ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
SETTINGS index_granularity = 8192;
Elasticseach Schema:
{
"ontime_sort" : {
"aliases" : { },
"mappings" : {
"properties" : {
"ActualElapsedTime" : {
"type" : "integer"
},
"AirTime" : {
"type" : "integer"
},
"AirlineID" : {
"type" : "integer"
},
"ArrDel15" : {
"type" : "integer"
},
"ArrDelay" : {
"type" : "integer"
},
"ArrDelayMinutes" : {
"type" : "integer"
},
"ArrTime" : {
"type" : "integer"
},
"ArrTimeBlk" : {
"type" : "keyword"
},
"ArrivalDelayGroups" : {
"type" : "integer"
},
"CRSArrTime" : {
"type" : "integer"
},
"CRSDepTime" : {
"type" : "integer"
},
"CRSElapsedTime" : {
"type" : "integer"
},
"CancellationCode" : {
"type" : "keyword"
},
"Cancelled" : {
"type" : "byte",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"Carrier" : {
"type" : "keyword"
},
"CarrierDelay" : {
"type" : "integer"
},
"DayOfWeek" : {
"type" : "byte",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"DayofMonth" : {
"type" : "byte",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"DepDel15" : {
"type" : "integer",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"DepDelay" : {
"type" : "integer"
},
"DepDelayMinutes" : {
"type" : "integer"
},
"DepTime" : {
"type" : "integer"
},
"DepTimeBlk" : {
"type" : "keyword"
},
"DepartureDelayGroups" : {
"type" : "keyword"
},
"Dest" : {
"type" : "keyword"
},
"DestAirportID" : {
"type" : "integer"
},
"DestAirportSeqID" : {
"type" : "integer"
},
"DestCityMarketID" : {
"type" : "integer"
},
"DestCityName" : {
"type" : "keyword"
},
"DestState" : {
"type" : "keyword"
},
"DestStateFips" : {
"type" : "keyword"
},
"DestStateName" : {
"type" : "keyword"
},
"DestWac" : {
"type" : "integer"
},
"Distance" : {
"type" : "integer"
},
"DistanceGroup" : {
"type" : "byte"
},
"Div1Airport" : {
"type" : "keyword"
},
"Div1AirportID" : {
"type" : "integer"
},
"Div1AirportSeqID" : {
"type" : "integer"
},
"Div1LongestGTime" : {
"type" : "keyword"
},
"Div1TailNum" : {
"type" : "keyword"
},
"Div1TotalGTime" : {
"type" : "keyword"
},
"Div1WheelsOff" : {
"type" : "keyword"
},
"Div1WheelsOn" : {
"type" : "keyword"
},
"Div2Airport" : {
"type" : "keyword"
},
"Div2AirportID" : {
"type" : "integer"
},
"Div2AirportSeqID" : {
"type" : "integer"
},
"Div2LongestGTime" : {
"type" : "keyword"
},
"Div2TailNum" : {
"type" : "keyword"
},
"Div2TotalGTime" : {
"type" : "keyword"
},
"Div2WheelsOff" : {
"type" : "keyword"
},
"Div2WheelsOn" : {
"type" : "keyword"
},
"Div3Airport" : {
"type" : "keyword"
},
"Div3AirportID" : {
"type" : "integer"
},
"Div3AirportSeqID" : {
"type" : "integer"
},
"Div3LongestGTime" : {
"type" : "keyword"
},
"Div3TailNum" : {
"type" : "keyword"
},
"Div3TotalGTime" : {
"type" : "keyword"
},
"Div3WheelsOff" : {
"type" : "keyword"
},
"Div3WheelsOn" : {
"type" : "keyword"
},
"Div4Airport" : {
"type" : "keyword"
},
"Div4AirportID" : {
"type" : "integer"
},
"Div4AirportSeqID" : {
"type" : "integer"
},
"Div4LongestGTime" : {
"type" : "keyword"
},
"Div4TailNum" : {
"type" : "keyword"
},
"Div4TotalGTime" : {
"type" : "keyword"
},
"Div4WheelsOff" : {
"type" : "keyword"
},
"Div4WheelsOn" : {
"type" : "keyword"
},
"Div5Airport" : {
"type" : "keyword"
},
"Div5AirportID" : {
"type" : "integer"
},
"Div5AirportSeqID" : {
"type" : "integer"
},
"Div5LongestGTime" : {
"type" : "keyword"
},
"Div5TailNum" : {
"type" : "keyword"
},
"Div5TotalGTime" : {
"type" : "keyword"
},
"Div5WheelsOff" : {
"type" : "keyword"
},
"Div5WheelsOn" : {
"type" : "keyword"
},
"DivActualElapsedTime" : {
"type" : "keyword"
},
"DivAirportLandings" : {
"type" : "keyword"
},
"DivArrDelay" : {
"type" : "keyword"
},
"DivDistance" : {
"type" : "keyword"
},
"DivReachedDest" : {
"type" : "keyword"
},
"Diverted" : {
"type" : "byte"
},
"FirstDepTime" : {
"type" : "keyword"
},
"FlightDate" : {
"type" : "date",
"format" : "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis"
},
"FlightNum" : {
"type" : "keyword"
},
"Flights" : {
"type" : "integer"
},
"LateAircraftDelay" : {
"type" : "integer"
},
"LongestAddGTime" : {
"type" : "keyword"
},
"Month" : {
"type" : "byte",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"NASDelay" : {
"type" : "integer"
},
"Origin" : {
"type" : "keyword"
},
"OriginAirportID" : {
"type" : "integer"
},
"OriginAirportSeqID" : {
"type" : "integer"
},
"OriginCityMarketID" : {
"type" : "integer"
},
"OriginCityName" : {
"type" : "keyword"
},
"OriginState" : {
"type" : "keyword"
},
"OriginStateFips" : {
"type" : "keyword"
},
"OriginStateName" : {
"type" : "keyword"
},
"OriginWac" : {
"type" : "integer"
},
"Quarter" : {
"type" : "byte"
},
"SecurityDelay" : {
"type" : "integer"
},
"Tail_Number" : {
"type" : "text",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
},
"TaxiIn" : {
"type" : "integer"
},
"TaxiOut" : {
"type" : "integer"
},
"TotalAddGTime" : {
"type" : "keyword"
},
"UniqueCarrier" : {
"type" : "keyword"
},
"WeatherDelay" : {
"type" : "integer"
},
"WheelsOff" : {
"type" : "integer"
},
"WheelsOn" : {
"type" : "integer"
},
"Year" : {
"type" : "short",
"fields" : {
"keyword" : {
"type" : "keyword",
"ignore_above" : 256
}
}
}
}
},
"settings" : {
"index" : {
"refresh_interval" : "10s",
"indexing" : {
"slowlog" : {
"level" : "info",
"threshold" : {
"index" : {
"warn" : "200ms",
"trace" : "20ms",
"debug" : "50ms",
"info" : "100ms"
}
},
"source" : "1000"
}
},
"translog" : {
"flush_threshold_size" : "4gb",
"sync_interval" : "120s",
"durability" : "async"
},
"max_inner_result_window" : "10000",
"max_result_window" : "1000000",
"requests" : {
"cache" : {
"enable" : "false"
}
},
"sort" : {
"field" : [
"Year",
"Month"
]
},
"unassigned" : {
"node_left" : {
"delayed_timeout" : "5m"
}
},
"number_of_replicas" : "0",
"queries" : {
"cache" : {
"enabled" : "false"
}
},
"routing" : {
"allocation" : {
"include" : {
"_tier_preference" : "data_content"
}
}
},
"search" : {
"slowlog" : {
"level" : "info",
"threshold" : {
"fetch" : {
"warn" : "200ms",
"trace" : "50ms",
"debug" : "80ms",
"info" : "100ms"
},
"query" : {
"warn" : "500ms",
"trace" : "50ms",
"debug" : "100ms",
"info" : "200ms"
}
}
}
},
"number_of_shards" : "16
}
}
}
}
Query Performance
They are read_only data, elasticsearch has force_merge to one segment per shard.
Query many times, and choice the 50% took result.
As query many times, datas are all in the pagecache.
Single query latency comparison
Concurrency query comparison
Use some clients query the server concurrently, making the server full busy. And to calculate how many qps the server can reserve.
Elasticsearch query DSL
##Q0
GET /ontime_sort/_search
{
"size": 0,
"_source" : false,
"stored_fields" : "_none_",
"aggregations" : {
"groupby" : {
"composite" : {
"size" : 1000,
"sources" : [
{
"year" : {
"terms" : {
"field" : "Year.keyword",
"missing_bucket" : true,
"order" : "asc"
}
}
},
{
"month" : {
"terms" : {
"field" : "Month.keyword",
"missing_bucket" : true,
"order" : "asc"
}
}
}
]
}
}
}
}
##Q1
GET /ontime_sort/_search
{
"size": 0,
"query": {
"range": {
"Year": {
"from": 2000,
"to": 2008,
"include_lower": true,
"include_upper": true
}
}
},
"aggregations": {
"groupby_DayOfWeek": {
"terms": {
"size": 1000,
"field": "DayOfWeek.keyword"
}
}
}
}
##Q2
GET /ontime_sort/_search
{
"size": 0,
"query" : {
"bool" : {
"must" : [
{
"range" : {
"DepDelay" : {
"from" : 10,
"to" : null,
"include_lower" : false,
"include_upper" : false,
"boost" : 1.0
}
}
},
{
"range" : {
"Year" : {
"from" : 2000,
"to" : 2008,
"include_lower" : true,
"include_upper" : true,
"time_zone" : "Z",
"boost" : 1.0
}
}
}
],
"adjust_pure_negative" : true,
"boost" : 1.0
}
},
"_source" : false,
"stored_fields" : "_none_",
"aggregations": {
"groupby_DayOfWeek": {
"terms": {
"size": 1000,
"field": "DayOfWeek.keyword"
}
}
}
}
##Q3
GET /ontime_sort/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"DepDelay": {
"from": 10,
"to": null,
"include_lower": false,
"include_upper": false,
"boost": 1
}
}
},
{
"range": {
"Year": {
"from": 2000,
"to": 2008,
"include_lower": true,
"include_upper": true,
"time_zone": "Z",
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
},
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_Origin": {
"terms": {
"size": 1000,
"field": "Origin"
}
}
}
}
##Q4
GET /ontime_sort/_search
{
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"DepDelay": {
"from": 10,
"to": null,
"include_lower": false,
"include_upper": false,
"boost": 1
}
}
},
{
"term": {
"Year.keyword": {
"value": 2007,
"boost": 1
}
}
}
],
"adjust_pure_negative": true,
"boost": 1
}
},
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_Carrier": {
"terms": {
"size": 1000,
"field": "Carrier"
}
}
}
}
##Q5
GET /ontime_sort/_search
{
"size": 0,
"query": {
"term": {
"Year.keyword": {
"value": 2007,
"boost": 1
}
}
},
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_Carrier": {
"terms": {
"size": 1000,
"field": "Carrier"
},
"aggregations": {
"DepDelay": {
"avg": {
"field": "DepDelay"
}
}
}
}
}
}
##Q6
GET /ontime_sort/_search
{
"size": 0,
"query" : {
"range" : {
"Year" : {
"from" : 2000,
"to" : 2008,
"include_lower" : true,
"include_upper" : true,
"time_zone" : "Z",
"boost" : 1.0
}
}
},
"_source" : false,
"stored_fields" : "_none_",
"aggregations": {
"groupby_Carrier": {
"terms": {
"size": 1000,
"field": "Carrier"
},
"aggregations": {
"DepDelay": {
"avg": {
"field": "DepDelay"
}
}
}
}
}
}
##Q7
GET /ontime_sort/_search
{
"size": 0,
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_Carrier": {
"terms": {
"size": 1000,
"field": "Year.keyword"
},
"aggregations": {
"DepDelay": {
"avg": {
"field": "DepDelay"
}
}
}
}
}
}
##Q8
GET /ontime_sort/_search
{
"size": 0,
"query": {
"range": {
"Year": {
"from": 2000,
"to": 2010,
"include_lower": true,
"include_upper": true,
"time_zone": "Z",
"boost": 1
}
}
},
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_DestCityName": {
"terms": {
"size": 22,
"field": "DestCityName"
},
"aggregations": {
"OriginCityName": {
"cardinality": {
"field": "OriginCityName"
}
}
}
}
}
}
##Q9
GET /ontime_sort/_search
{
"size": 0,
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_DestCityName": {
"terms": {
"size": 1000,
"field": "Year.keyword"
}
}
}
}
##Q10
GET /ontime_sort/_search
{
"size": 0,
"query": {
"bool": {
"must_not": [
{
"terms": {
"DayOfWeek.keyword": [
6,
7
]
}
},
{
"terms": {
"OriginState": [
"AK",
"HI",
"PR",
"VI"
]
}
},
{
"terms": {
"DestState": [
"AK",
"HI",
"PR",
"VI"
]
}
}
],
"must": {
"range": {
"FlightDate": {
"to": "2010-01-01",
"include_upper": false
}
}
}
}
},
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby": {
"terms": {
"size": 1000,
"field": "Carrier"
},
"aggregations": {
"Year": {
"stats": {
"field": "Year"
}
},
"ArrDelayMinutes": {
"avg": {
"field": "ArrDelayMinutes"
}
},
"having": {
"bucket_selector": {
"buckets_path": {
"year_count": "_count",
"max_year": "Year.max"
},
"script": "params.year_count>100000 && params.max_year>1990",
"gap_policy": "skip"
}
}
}
}
}
}
##Q11
GET /ontime_sort/_search
{
"size": 0,
"query" : {
"term" : {
"DepDel15.keyword": {
"value" : "1.00",
"boost" : 1.0
}
}
},
"_source" : false,
"stored_fields" : "_none_",
"aggregations" : {
"groupby" : {
"composite" : {
"size" : 1000,
"sources" : [
{
"Year" : {
"terms" : {
"field" : "Year.keyword",
"missing_bucket" : true,
"order" : "asc"
}
}
},
{
"Month" : {
"terms" : {
"field" : "Month.keyword",
"missing_bucket" : true,
"order" : "asc"
}
}
}
]
}
}
}
}
##Q12
GET /ontime_sort/_search
{
"size": 0,
"_source" : false,
"stored_fields" : "_none_",
"aggregations" : {
"groupby" : {
"composite" : {
"size" : 1000,
"sources" : [
{
"Year" : {
"terms" : {
"field" : "Year.keyword",
"missing_bucket" : true,
"order" : "asc"
}
}
},
{
"Month" : {
"terms" : {
"field" : "Month.keyword",
"missing_bucket" : true,
"order" : "asc"
}
}
}
]
}
}
}
}
##Q13
GET /ontime_sort/_search
{
"size": 0,
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_Carrier": {
"terms": {
"size": 1000,
"field": "DestCityName"
},
"aggregations": {
"OriginCityName": {
"cardinality": {
"field": "OriginCityName"
}
}
}
}
}
}
##Q14
GET /ontime_sort/_search
{
"size": 0,
"_source": false,
"stored_fields": "_none_",
"aggregations": {
"groupby_OriginCityName": {
"terms": {
"size": 100,
"field": "OriginCityName"
},
"aggregations": {
"groupby_DestCityName": {
"terms": {
"size": 10,
"field": "DestCityName"
}
}
}
}
}
}
ClickHouse query SQL
--Q0
SELECT Year, Month, count(*) AS c1
FROM ontime
GROUP BY Year, Month;
--Q1 The number of flights per day from the year 2000 to 2008
SELECT DayOfWeek, count(*) AS c
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY DayOfWeek
ORDER BY c DESC;
--Q2 The number of flights delayed by more than 10 minutes, grouped by the day of the week, for 2000-2008
SELECT DayOfWeek, count(*) AS c
FROM ontime
WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
GROUP BY DayOfWeek
ORDER BY c DESC;
--Q3 The number of delays by airport for 2000-2008
SELECT Origin, count(*) AS c
FROM ontime
WHERE DepDelay>10 AND Year>=2000 AND Year<=2008
GROUP BY Origin
ORDER BY c DESC
LIMIT 10;
--Q4 The number of delays by carrier for 2007
SELECT Carrier, count(*) AS c
FROM ontime
WHERE DepDelay>10 AND Year=2007
GROUP BY Carrier
ORDER BY count(*) DESC;
--Q5
SELECT Carrier, avg(DepDelay) AS c3
FROM ontime
WHERE Year=2007
GROUP BY Carrier
ORDER BY c3 DESC;
--Q6 The previous request for a broader range of years, 2000-2008
SELECT Carrier, avg(DepDelay) AS c3
FROM ontime
WHERE Year>=2000 AND Year<=2008
GROUP BY Carrier
ORDER BY c3 DESC;
--Q7 Percentage of flights delayed for more than 10 minutes, by year
SELECT Year, avg(DepDelay)
FROM ontime
GROUP BY Year
ORDER BY Year;
--Q8 The most popular destinations by the number of directly connected cities for various year ranges
SELECT DestCityName, count(distinct OriginCityName) AS u
FROM ontime
WHERE Year>=2000 and Year<=2010
GROUP BY DestCityName
ORDER BY u DESC LIMIT 10;
--Q9
SELECT Year, count(*) AS c1
FROM ontime
GROUP BY Year;
--Q10
SELECT
min(Year), max(Year), Carrier, count(*) AS cnt,
avg(ArrDelayMinutes) AS rate
FROM ontime
WHERE
DayOfWeek NOT IN (6,7) AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI')
AND DestState NOT IN ('AK', 'HI', 'PR', 'VI')
AND FlightDate < '2010-01-01'
GROUP by Carrier
HAVING cnt>100000 and max(Year)>1990
ORDER by rate DESC
LIMIT 1000;
--Q11
SELECT Year,Month,count(*) AS cnt
FROM ontime
WHERE DepDel15=1
GROUP BY Year,Month;
--Q12
SELECT Year,Month,count(*) AS c1
FROM ontime
GROUP BY Year,Month;
--Q13
SELECT DestCityName, uniqExact(OriginCityName) AS u
FROM ontime
GROUP BY DestCityName
ORDER BY u DESC
LIMIT 10;
--Q14
SELECT OriginCityName, DestCityName, count() AS c
FROM ontime
GROUP BY OriginCityName, DestCityName
ORDER BY c DESC
LIMIT 10;
Load Performance
The load performance result is from a ClickHouse team, we didn't record the load performance. But we try to load the ontime data, the ClickHouse is real 10 times faster then Elasticsearch.
conclusion: