本节继续介绍排序的实现,上一节介绍了ExecSort,本节介绍该函数中调用的排序的具体实现函数,本节是第一部分,包括tuplesort_begin_heap/tuplesort_puttupleslot.
SortState
排序运行期状态信息
/* ----------------
* SortState information
* 排序运行期状态信息
* ----------------
*/
typedef struct SortState
{
//基类
ScanState ss; /* its first field is NodeTag */
//是否需要随机访问排序输出?
bool randomAccess; /* need random access to sort output? */
//结果集是否存在边界?
bool bounded; /* is the result set bounded? */
//如存在边界,需要多少个元组?
int64 bound; /* if bounded, how many tuples are needed */
//是否已完成排序?
bool sort_Done; /* sort completed yet? */
//是否使用有界值?
bool bounded_Done; /* value of bounded we did the sort with */
//使用的有界值?
int64 bound_Done; /* value of bound we did the sort with */
//tuplesort.c的私有状态
void *tuplesortstate; /* private state of tuplesort.c */
//是否worker?
bool am_worker; /* are we a worker? */
//每个worker对应一个条目
SharedSortInfo *shared_info; /* one entry per worker */
} SortState;
/* ----------------
* Shared memory container for per-worker sort information
* per-worker排序信息的共享内存容器
* ----------------
*/
typedef struct SharedSortInfo
{
//worker个数?
int num_workers;
//排序机制
TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
} SharedSortInfo;
TuplesortInstrumentation
报告排序统计的数据结构.
/*
* Data structures for reporting sort statistics. Note that
* TuplesortInstrumentation can't contain any pointers because we
* sometimes put it in shared memory.
* 报告排序统计的数据结构.
* 注意TuplesortInstrumentation不能包含指针因为有时候会把该结构体放在共享内存中.
*/
typedef enum
{
SORT_TYPE_STILL_IN_PROGRESS = 0,//仍然在排序中
SORT_TYPE_TOP_N_HEAPSORT,//TOP N 堆排序
SORT_TYPE_QUICKSORT,//快速排序
SORT_TYPE_EXTERNAL_SORT,//外部排序
SORT_TYPE_EXTERNAL_MERGE//外部排序后的合并
} TuplesortMethod;//排序方法
typedef enum
{
SORT_SPACE_TYPE_DISK,//需要用上磁盘
SORT_SPACE_TYPE_MEMORY//使用内存
} TuplesortSpaceType;
typedef struct TuplesortInstrumentation
{
//使用的排序算法
TuplesortMethod sortMethod; /* sort algorithm used */
//排序使用空间类型
TuplesortSpaceType spaceType; /* type of space spaceUsed represents */
//空间消耗(以K为单位)
long spaceUsed; /* space consumption, in kB */
} TuplesortInstrumentation;
tuplesort_begin_heap和tuplesort_puttupleslot都是准备工作,把tuple放到数组(堆)中,为后续的实际排序实现作准备.
tuplesort_begin_heap
Tuplesortstate *
tuplesort_begin_heap(TupleDesc tupDesc,//元组描述符
int nkeys, //排序键个数
AttrNumber *attNums,//属性编号
Oid *sortOperators, //排序操作符
Oid *sortCollations,//排序规则
bool *nullsFirstFlags,//标记
int workMem, //内存大小
SortCoordinate coordinate,//协调器
bool randomAccess)//是否随机访问
{
//获取排序状态
Tuplesortstate *state = tuplesort_begin_common(workMem, coordinate,
randomAccess);
MemoryContext oldcontext;
int i;
oldcontext = MemoryContextSwitchTo(state->sortcontext);
AssertArg(nkeys > 0);
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG,
"begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c",
nkeys, workMem, randomAccess ? 't' : 'f');
#endif
state->nKeys = nkeys;
TRACE_POSTGRESQL_SORT_START(HEAP_SORT,
false, /* no unique check */
nkeys,
workMem,
randomAccess,
PARALLEL_SORT(state));
//设置运行状态
state->comparetup = comparetup_heap;
state->copytup = copytup_heap;
state->writetup = writetup_heap;
state->readtup = readtup_heap;
//假定不需要拷贝元组描述符
state->tupDesc = tupDesc; /* assume we need not copy tupDesc */
state->abbrevNext = 10;
/* Prepare SortSupport data for each column */
//为每一列准备SortSupport数据(分配内存空间)
state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
for (i = 0; i < nkeys; i++)
{
//------- 遍历排序键
//排序键
SortSupport sortKey = state->sortKeys + i;
AssertArg(attNums[i] != 0);
AssertArg(sortOperators[i] != 0);
//设置SortSupport
sortKey->ssup_cxt = CurrentMemoryContext;
sortKey->ssup_collation = sortCollations[i];
sortKey->ssup_nulls_first = nullsFirstFlags[i];
sortKey->ssup_attno = attNums[i];
/* Convey if abbreviation optimization is applicable in principle */
//缩写优化是否原则上可用?
sortKey->abbreviate = (i == 0);
//设置
PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
}
/*
* The "onlyKey" optimization cannot be used with abbreviated keys, since
* tie-breaker comparisons may be required. Typically, the optimization
* is only of value to pass-by-value types anyway, whereas abbreviated
* keys are typically only of value to pass-by-reference types.
* 对于缩写优化"onlyKey"优化不能使用,因为可能需要tie-breaker比较.
* 典型的,优化器只对按值传递类型有价值,而缩写建通常只对引用传递类型有价值.
*/
if (nkeys == 1 && !state->sortKeys->abbrev_converter)
state->onlyKey = state->sortKeys;
MemoryContextSwitchTo(oldcontext);
return state;
}
tuplesort_puttupleslot
接收一个元组(一行)
/*
* Accept one tuple while collecting input data for sort.
* 接收一个元组(一行)
*
* Note that the input data is always copied; the caller need not save it.
* 注意输入数据通常会被拷贝,调用者不需要保存这些数据.
*/
void
tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
{
MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
SortTuple stup;
/*
* Copy the given tuple into memory we control, and decrease availMem.
* Then call the common code.
* 拷贝给定的元组在我们控制的内存中,同时减少可用内存.
* 然后调用puttuple_common函数.
*/
//#define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup))
COPYTUP(state, &stup, (void *) slot);
puttuple_common(state, &stup);
MemoryContextSwitchTo(oldcontext);
}
/*
* Shared code for tuple and datum cases.
* 元组和datum共享的代码.
*/
static void
puttuple_common(Tuplesortstate *state, SortTuple *tuple)
{
Assert(!LEADER(state));
switch (state->status)
{
case TSS_INITIAL://初始化
/*
* Save the tuple into the unsorted array. First, grow the array
* as needed. Note that we try to grow the array when there is
* still one free slot remaining --- if we fail, there'll still be
* room to store the incoming tuple, and then we'll switch to
* tape-based operation.
* 在未排序的数组中存储元组.
* 首先,如需要则扩展数组大小.注意在只有1个slot剩下来的时候才尝试扩展数组.
* 假如扩展失败,仍有空间可用于存储输入的元组,然后将切换至tape-based(基于磁带)操作.
*/
if (state->memtupcount >= state->memtupsize - 1)
{
(void) grow_memtuples(state);
Assert(state->memtupcount < state->memtupsize);
}
state->memtuples[state->memtupcount++] = *tuple;//在数组中保存元组
/*
* Check if it's time to switch over to a bounded heapsort. We do
* so if the input tuple count exceeds twice the desired tuple
* count (this is a heuristic for where heapsort becomes cheaper
* than a quicksort), or if we've just filled workMem and have
* enough tuples to meet the bound.
* 检查是否切换至有界heapsort.
* 在输入元组个数超出期望元组个数两倍的情况下执行该检查
* (在堆排序比快速排序成本更低时所获得的洞见),
* 或者如果我们已经填充了workMem并且有足够的元组满足bound时也执行该检查.
*
* Note that once we enter TSS_BOUNDED state we will always try to
* complete the sort that way. In the worst case, if later input
* tuples are larger than earlier ones, this might cause us to
* exceed workMem significantly.
* 注意我们一旦进入TSS_BOUNDED状态,将尝试按指定的方式完成排序.
* 在最坏的情况下,如果后续的输入元组比早前的要大,这可能会导致内存大小会超出workMem.
*/
if (state->bounded &&
(state->memtupcount > state->bound * 2 ||
(state->memtupcount > state->bound && LACKMEM(state))))
{
#ifdef TRACE_SORT
if (trace_sort)
elog(LOG, "switching to bounded heapsort at %d tuples: %s",
state->memtupcount,
pg_rusage_show(&state->ru_start));
#endif
//切换至堆排序
make_bounded_heap(state);
return;
}
/*
* Done if we still fit in available memory and have array slots.
* 如果内存空间可用并且数组有位置存储,则已完成任务!
*/
if (state->memtupcount < state->memtupsize && !LACKMEM(state))
return;
/*
* Nope; time to switch to tape-based operation.
* 切换至tape-base操作
*/
inittapes(state, true);
/*
* Dump all tuples.
* dump所有元组
*/
dumptuples(state, false);
break;
case TSS_BOUNDED://有界堆排序
/*
* We don't want to grow the array here, so check whether the new
* tuple can be discarded before putting it in. This should be a
* good speed optimization, too, since when there are many more
* input tuples than the bound, most input tuples can be discarded
* with just this one comparison. Note that because we currently
* have the sort direction reversed, we must check for <= not >=.
* 不希望在这里扩展数组,因此检查在放到数组前检查是否可用废弃某些新元组.
* 这将是一个很好的性能优化,因为存在比边界要大得多的输入元组,
* 大多数输入元组会通过对比被废弃.
*/
if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0)
{
/* new tuple <= top of the heap, so we can discard it */
// 新元组 <= 堆顶,可以废弃
free_sort_tuple(state, tuple);
CHECK_FOR_INTERRUPTS();
}
else
{
/* discard top of heap, replacing it with the new tuple */
//废弃堆顶,使用新元组替换
free_sort_tuple(state, &state->memtuples[0]);
tuplesort_heap_replace_top(state, tuple);
}
break;
case TSS_BUILDRUNS://构建运行期信息
/*
* Save the tuple into the unsorted array (there must be space)
* 保存元组到未排序数组中(存在空闲空间)
*/
state->memtuples[state->memtupcount++] = *tuple;
/*
* If we are over the memory limit, dump all tuples.
* 如果已超出内存限制,则dump所有元组
*/
dumptuples(state, false);
break;
default:
elog(ERROR, "invalid tuplesort state");
break;
}
}
N/A
N/A
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。