diff options
author | Michael Paquier | 2012-07-24 07:13:55 +0000 |
---|---|---|
committer | Michael Paquier | 2012-07-24 07:35:37 +0000 |
commit | d03ea805cef9375bee9b751e65d698c07c138bf5 (patch) | |
tree | 2e578af76c1ac515887ff0363f5224f57af64a92 /src/include | |
parent | baa8c4a51cdd7de321169f12ebfb47b02fed3afc (diff) |
Support for online data redistribution with ALTER TABLE
Online data redistribution is the possibility for a user to change the distribution
strategy of a table. There are no restrictions in the modifications possible, meaning
that all types of tables with all possible node subsets can be completely changed in
one command.
The SQL command used for redistribution is an extension of ALTER TABLE with those
clauses specific to XC and already available in CREATE TABLE:
DISTRIBUTE BY { REPLICATION | ROUND ROBIN | { [HASH | MODULO ] ( column_name ) } }
TO { GROUP groupname | NODE ( nodename [, ... ] ) }
ADD NODE ( nodename [, ... ] )
DELETE NODE ( nodename [, ... ] )
Those commands can be combined together without limitations.
Several redistribution scenarios are implemented depending on the old and new
distribution type of the table:
- Default scenario:
1) Fetch the data of the table with a COPY TO and store it inside a tuplestore
2) Perform a TRUNCATE on the Datanodes
3) Perform a COPY FROM with tuples inside tuplestore
4) REINDEX table if necessary
This default scenario could also be managed by an external tool, however all
the following optimizations need a node-level control to perform with highest
efficiency possible. The performance of this scenario is equivalent to running
a COPY TO/COPY FROM sequence on a table, so here performance is not bounded by
the redistribution mechanism itself but by the COPY protocol used to exchange data
over the network.
- Replicated to replicated:
In case of nodes removed from the set of nodes, those nodes are simply truncated,
so this is really quick even on large sets of data.
For new nodes, data is fetched on Coordinator from one Datanode with COPY TO,
data is stored in a tuplestore, and then COPY FROM is launched only on the new
nodes.
- Replicated to distributed:
If new nodes are added, a fallback to default scenario is made.
If nodes are removed, those nodes are truncated.
Finally, on the remaining nodes a DELETE query removing only the necessary tuples
is launched to each remote node. In this case there is no data exchanged between
nodes so performance is maximized.
In order to support all those scenarios, a couple of new internal mechanisms have
been added to XC: materialization on Coordinator of tuple slots and possibility
to reuse them for redistribution purposes, externalization of a portion of
PostgreSQL COPY code used by redistribution, reuse and extension of Postgres-XC
APIs for remote COPY management.
The tuplestore used to store tuples if necessary can have its allowed cache
controlled with work_mem. The only thing to take care of is that the tuplestore
data needs to be stored on Coordinator once so some additional disk space might
be necessary on this server to perform redistribution correctly.
Documentation, as well as a new set of regression tests, has been added.
Regression tests check views, prepared statements, distribution types
and subsets in a way that is completely transparent whatever the cluster configuration.
Diffstat (limited to 'src/include')
-rw-r--r-- | src/include/access/hash.h | 1 | ||||
-rw-r--r-- | src/include/catalog/pgxc_class.h | 25 | ||||
-rw-r--r-- | src/include/nodes/parsenodes.h | 6 | ||||
-rw-r--r-- | src/include/pgxc/copyops.h | 27 | ||||
-rw-r--r-- | src/include/pgxc/execRemote.h | 17 | ||||
-rw-r--r-- | src/include/pgxc/locator.h | 1 | ||||
-rw-r--r-- | src/include/pgxc/redistrib.h | 80 | ||||
-rw-r--r-- | src/include/pgxc/remotecopy.h | 1 | ||||
-rw-r--r-- | src/include/utils/rel.h | 8 |
9 files changed, 159 insertions, 7 deletions
diff --git a/src/include/access/hash.h b/src/include/access/hash.h index bc7006dfb3..777a9369aa 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -358,6 +358,7 @@ extern void hash_desc(StringInfo buf, uint8 xl_info, char *rec); #ifdef PGXC extern Datum compute_hash(Oid type, Datum value, char locator); +extern char *get_compute_hash_function(Oid type, char locator); #endif #endif /* HASH_H */ diff --git a/src/include/catalog/pgxc_class.h b/src/include/catalog/pgxc_class.h index 5a0cd597d3..cb540bf584 100644 --- a/src/include/catalog/pgxc_class.h +++ b/src/include/catalog/pgxc_class.h @@ -22,22 +22,37 @@ CATALOG(pgxc_class,9001) BKI_WITHOUT_OIDS typedef FormData_pgxc_class *Form_pgxc_class; -#define Natts_pgxc_class 6 +#define Natts_pgxc_class 6 -#define Anum_pgxc_class_pcrelid 1 +#define Anum_pgxc_class_pcrelid 1 #define Anum_pgxc_class_pclocatortype 2 -#define Anum_pgxc_class_pcattnum 3 +#define Anum_pgxc_class_pcattnum 3 #define Anum_pgxc_class_pchashalgorithm 4 #define Anum_pgxc_class_pchashbuckets 5 -#define Anum_pgxc_class_nodes 6 +#define Anum_pgxc_class_nodes 6 + +typedef enum PgxcClassAlterType +{ + PGXC_CLASS_ALTER_DISTRIBUTION, + PGXC_CLASS_ALTER_NODES, + PGXC_CLASS_ALTER_ALL +} PgxcClassAlterType; extern void PgxcClassCreate(Oid pcrelid, - char pclocatortype, + char pclocatortype, int pcattnum, int pchashalgorithm, int pchashbuckets, int numnodes, Oid *nodes); +extern void PgxcClassAlter(Oid pcrelid, + char pclocatortype, + int pcattnum, + int pchashalgorithm, + int pchashbuckets, + int numnodes, + Oid *nodes, + PgxcClassAlterType type); extern void RemovePgxcClass(Oid pcrelid); #endif /* PGXC_CLASS_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index e8f2317c1b..8a837b39d5 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1244,6 +1244,12 @@ typedef enum AlterTableType AT_DropInherit, /* NO INHERIT parent */ AT_AddOf, /* OF <type_name> */ AT_DropOf, /* NOT OF */ 
+#ifdef PGXC + AT_DistributeBy, /* DISTRIBUTE BY ... */ + AT_SubCluster, /* TO [ NODE nodelist | GROUP groupname ] */ + AT_AddNodeList, /* ADD NODE nodelist */ + AT_DeleteNodeList, /* DELETE NODE nodelist */ +#endif AT_GenericOptions /* OPTIONS (...) */ } AlterTableType; diff --git a/src/include/pgxc/copyops.h b/src/include/pgxc/copyops.h new file mode 100644 index 0000000000..862dbbd299 --- /dev/null +++ b/src/include/pgxc/copyops.h @@ -0,0 +1,27 @@ +/*-------------------------------------------------------------------------- + * + * copyops.h + * Routines for manipulation of remote COPY data + * + * + * Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/include/pgxc/copyops.h + * + *------------------------------------------------------------------------- + */ + +#ifndef COPYOPS_H +#define COPYOPS_H + +#include "access/tupdesc.h" + +/* Type of data delimiter used for data redistribution using remote COPY */ +#define COPYOPS_DELIMITER '\t' + +extern char **CopyOps_RawDataToArrayField(TupleDesc tupdesc, char *message, int len); +extern char *CopyOps_BuildOneRowTo(TupleDesc tupdesc, Datum *values, bool *nulls, int *len); + +#endif diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 32a88ecca4..5e26850d1c 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -48,6 +48,17 @@ typedef enum REQUEST_TYPE_COPY_OUT /* Copy Out response */ } RequestType; +/* + * Type of requests associated to a remote COPY OUT + */ +typedef enum +{ + REMOTE_COPY_NONE, /* Not defined yet */ + REMOTE_COPY_STDOUT, /* Send back to client */ + REMOTE_COPY_FILE, /* Write in file */ + REMOTE_COPY_TUPLESTORE /* Store data in tuplestore */ +} RemoteCopyType; + /* Combines results of INSERT statements using multiple values */ typedef struct CombineTag { @@ -107,7 +118,8 @@ typedef struct RemoteQueryState /* Simple DISTINCT support */ FmgrInfo *eqfunctions; /* functions to compare tuples */ MemoryContext 
tmp_ctx; /* separate context is needed to compare tuples */ - FILE *copy_file; /* used if copy_dest == COPY_FILE */ + RemoteCopyType remoteCopyType; /* Type of remote COPY operation */ + FILE *copy_file; /* used if remoteCopyType == REMOTE_COPY_FILE */ uint64 processed; /* count of data rows when running CopyOut */ /* cursor support */ char *cursor; /* cursor name */ @@ -136,7 +148,8 @@ extern void PGXCNodeCommitPrepared(char *gid); /* Copy command just involves Datanodes */ extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot); extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections); -extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file); +extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, TupleDesc tupleDesc, + FILE* copy_file, Tuplestorestate *store, RemoteCopyType remoteCopyType); extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type); extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error); extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections); diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index bd719911ea..78ce3cff00 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -99,6 +99,7 @@ extern RelationLocInfo *GetRelationLocInfo(Oid relid); extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info); extern char GetRelationLocType(Oid relid); extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info); +extern bool IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2); extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, bool isValueNull, Oid typeOfValueForDistCol, RelationAccessType accessType); diff --git a/src/include/pgxc/redistrib.h 
b/src/include/pgxc/redistrib.h new file mode 100644 index 0000000000..ee94523dbb --- /dev/null +++ b/src/include/pgxc/redistrib.h @@ -0,0 +1,80 @@ +/*------------------------------------------------------------------------- + * + * redistrib.h + * Routines related to online data redistribution + * + * Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/include/pgxc/redistrib.h + * + *------------------------------------------------------------------------- + */ + +#ifndef REDISTRIB_H +#define REDISTRIB_H + +#include "nodes/parsenodes.h" +#include "utils/tuplestore.h" + +/* + * Type of data redistribution operations. + * Online data redistribution is made of one or more of those operations. + */ +typedef enum RedistribOperation { + DISTRIB_NONE, /* Default operation */ + DISTRIB_DELETE_HASH, /* Perform a DELETE with hash value check */ + DISTRIB_DELETE_MODULO, /* Perform a DELETE with modulo value check */ + DISTRIB_COPY_TO, /* Perform a COPY TO */ + DISTRIB_COPY_FROM, /* Perform a COPY FROM */ + DISTRIB_TRUNCATE, /* Truncate relation */ + DISTRIB_REINDEX /* Reindex relation */ +} RedistribOperation; + +/* + * Determine if operation can be done before or after + * catalog update on local node. + */ +typedef enum RedistribCatalog { + CATALOG_UPDATE_NONE, /* Default state */ + CATALOG_UPDATE_AFTER, /* After catalog update */ + CATALOG_UPDATE_BEFORE, /* Before catalog update */ + CATALOG_UPDATE_BOTH /* Before and after catalog update */ +} RedistribCatalog; + +/* + * Redistribution command + * This contains the tools necessary to perform a redistribution operation. 
+ */ +typedef struct RedistribCommand { + RedistribOperation type; /* Operation type */ + ExecNodes *execNodes; /* List of nodes where to perform operation */ + RedistribCatalog updateState; /* Flag to determine if operation can be done + * before or after catalog update */ +} RedistribCommand; + +/* + * Redistribution operation state + * Maintainer of redistribution state having the list of commands + * to be performed during redistribution. + * For the list of commands, we use an array and not a simple list as operations + * might need to be done in a certain order. + */ +typedef struct RedistribState { + Oid relid; /* Oid of relation redistributed */ + List *commands; /* List of commands */ + Tuplestorestate *store; /* Tuple store used for temporary data storage */ +} RedistribState; + +extern void PGXCRedistribTable(RedistribState *distribState, RedistribCatalog type); +extern void PGXCRedistribCreateCommandList(RedistribState *distribState, + RelationLocInfo *newLocInfo); +extern RedistribCommand *makeRedistribCommand(RedistribOperation type, + RedistribCatalog updateState, + ExecNodes *nodes); +extern RedistribState *makeRedistribState(Oid relOid); +extern void FreeRedistribState(RedistribState *state); +extern void FreeRedistribCommand(RedistribCommand *command); + +#endif /* REDISTRIB_H */ diff --git a/src/include/pgxc/remotecopy.h b/src/include/pgxc/remotecopy.h index 77134e71f9..93368c0ada 100644 --- a/src/include/pgxc/remotecopy.h +++ b/src/include/pgxc/remotecopy.h @@ -70,6 +70,7 @@ extern void RemoteCopy_BuildStatement(RemoteCopyData *state, extern void RemoteCopy_GetRelationLoc(RemoteCopyData *state, Relation rel, List *attnums); +extern RemoteCopyOptions *makeRemoteCopyOptions(void); extern void FreeRemoteCopyData(RemoteCopyData *state); extern void FreeRemoteCopyOptions(RemoteCopyOptions *options); #endif diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index fde1467185..4eaabe6592 100644 --- a/src/include/utils/rel.h +++ 
b/src/include/utils/rel.h @@ -365,6 +365,14 @@ typedef struct StdRdOptions #define RelationUsesTempNamespace(relation) \ ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) +#ifdef PGXC +/* + * RelationGetLocInfo + * Return the location info of relation + */ +#define RelationGetLocInfo(relation) ((relation)->rd_locator_info) +#endif + /* * RELATION_IS_LOCAL * If a rel is either temp or newly created in the current transaction, |