summaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
authorMichael Paquier2012-07-24 07:13:55 +0000
committerMichael Paquier2012-07-24 07:35:37 +0000
commitd03ea805cef9375bee9b751e65d698c07c138bf5 (patch)
tree2e578af76c1ac515887ff0363f5224f57af64a92 /src/include
parentbaa8c4a51cdd7de321169f12ebfb47b02fed3afc (diff)
Support for online data redistribution with ALTER TABLE
Online data redistribution lets a user change the distribution strategy of a table. There are no restrictions on the possible modifications, meaning that all types of tables with all possible node subsets can be completely changed in one command. The SQL command used for redistribution is an extension of ALTER TABLE with those clauses specific to XC and already available in CREATE TABLE: DISTRIBUTE BY { REPLICATION | ROUND ROBIN | { [HASH | MODULO ] ( column_name ) } } TO { GROUP groupname | NODE ( nodename [, ... ] ) } ADD NODE ( nodename [, ... ] ) DELETE NODE ( nodename [, ... ] ) Those clauses can be combined together without limitations. Several redistribution scenarios are implemented depending on the old and new distribution type of the table: - Default scenario: 1) Fetch the data of the table with a COPY TO and store it inside a tuplestore 2) Perform a TRUNCATE on the Datanodes 3) Perform a COPY FROM with the tuples inside the tuplestore 4) REINDEX table if necessary This default scenario could also be managed by an external tool; however, all the following optimizations need node-level control to perform with the highest possible efficiency. The performance of this scenario is equivalent to running a COPY TO/COPY FROM sequence on a table, so here performance is not bounded by the redistribution mechanism itself but by the COPY protocol used for data exchange over the network. - Replicated to replicated: When nodes are removed from the set of nodes, those nodes are simply truncated, so this is really quick even on large sets of data. For new nodes, data is fetched on Coordinator from one Datanode with COPY TO, data is stored in a tuplestore, and then COPY FROM is launched only on the new nodes. - Replicated to distributed: If new nodes are added, a fallback to the default scenario is made. If nodes are removed, those nodes are truncated. Finally, on the remaining nodes a DELETE query removing only the necessary tuples is launched on each remote node.
In this case there is no data exchanged between nodes so performance is maximized. In order to support all those scenarios, a couple of new internal mechanisms have been added to XC: materialization on Coordinator of tuple slots and possibility to reuse them for redistribution purposes, externalization of a portion of PostgreSQL COPY code used by redistribution, reuse and extension of Postgres-XC APIs for remote COPY management. The tuplestore used to store tuples if necessary can have its allowed cache controlled with work_mem. The only thing to take care of is that the tuplestore data needs to be stored on Coordinator once, so some additional disk space might be necessary on this server to perform redistribution correctly. Documentation, as well as a new set of regression tests, has been added. The regression tests check views, prepared statements, distribution types and node subsets in a way that is completely transparent whatever the cluster configuration.
Diffstat (limited to 'src/include')
-rw-r--r--src/include/access/hash.h1
-rw-r--r--src/include/catalog/pgxc_class.h25
-rw-r--r--src/include/nodes/parsenodes.h6
-rw-r--r--src/include/pgxc/copyops.h27
-rw-r--r--src/include/pgxc/execRemote.h17
-rw-r--r--src/include/pgxc/locator.h1
-rw-r--r--src/include/pgxc/redistrib.h80
-rw-r--r--src/include/pgxc/remotecopy.h1
-rw-r--r--src/include/utils/rel.h8
9 files changed, 159 insertions, 7 deletions
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index bc7006dfb3..777a9369aa 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -358,6 +358,7 @@ extern void hash_desc(StringInfo buf, uint8 xl_info, char *rec);
#ifdef PGXC
extern Datum compute_hash(Oid type, Datum value, char locator);
+extern char *get_compute_hash_function(Oid type, char locator);
#endif
#endif /* HASH_H */
diff --git a/src/include/catalog/pgxc_class.h b/src/include/catalog/pgxc_class.h
index 5a0cd597d3..cb540bf584 100644
--- a/src/include/catalog/pgxc_class.h
+++ b/src/include/catalog/pgxc_class.h
@@ -22,22 +22,37 @@ CATALOG(pgxc_class,9001) BKI_WITHOUT_OIDS
typedef FormData_pgxc_class *Form_pgxc_class;
-#define Natts_pgxc_class 6
+#define Natts_pgxc_class 6
-#define Anum_pgxc_class_pcrelid 1
+#define Anum_pgxc_class_pcrelid 1
#define Anum_pgxc_class_pclocatortype 2
-#define Anum_pgxc_class_pcattnum 3
+#define Anum_pgxc_class_pcattnum 3
#define Anum_pgxc_class_pchashalgorithm 4
#define Anum_pgxc_class_pchashbuckets 5
-#define Anum_pgxc_class_nodes 6
+#define Anum_pgxc_class_nodes 6
+
+typedef enum PgxcClassAlterType
+{
+ PGXC_CLASS_ALTER_DISTRIBUTION,
+ PGXC_CLASS_ALTER_NODES,
+ PGXC_CLASS_ALTER_ALL
+} PgxcClassAlterType;
extern void PgxcClassCreate(Oid pcrelid,
- char pclocatortype,
+ char pclocatortype,
int pcattnum,
int pchashalgorithm,
int pchashbuckets,
int numnodes,
Oid *nodes);
+extern void PgxcClassAlter(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets,
+ int numnodes,
+ Oid *nodes,
+ PgxcClassAlterType type);
extern void RemovePgxcClass(Oid pcrelid);
#endif /* PGXC_CLASS_H */
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index e8f2317c1b..8a837b39d5 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1244,6 +1244,12 @@ typedef enum AlterTableType
AT_DropInherit, /* NO INHERIT parent */
AT_AddOf, /* OF <type_name> */
AT_DropOf, /* NOT OF */
+#ifdef PGXC
+ AT_DistributeBy, /* DISTRIBUTE BY ... */
+ AT_SubCluster, /* TO [ NODE nodelist | GROUP groupname ] */
+ AT_AddNodeList, /* ADD NODE nodelist */
+ AT_DeleteNodeList, /* DELETE NODE nodelist */
+#endif
AT_GenericOptions /* OPTIONS (...) */
} AlterTableType;
diff --git a/src/include/pgxc/copyops.h b/src/include/pgxc/copyops.h
new file mode 100644
index 0000000000..862dbbd299
--- /dev/null
+++ b/src/include/pgxc/copyops.h
@@ -0,0 +1,27 @@
+/*--------------------------------------------------------------------------
+ *
+ * copyops.h
+ * Routines for manipulation of remote COPY data
+ *
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/include/pgxc/copyops.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COPYOPS_H
+#define COPYOPS_H
+
+#include "access/tupdesc.h"
+
+/* Type of data delimiter used for data redistribution using remote COPY */
+#define COPYOPS_DELIMITER '\t'
+
+extern char **CopyOps_RawDataToArrayField(TupleDesc tupdesc, char *message, int len);
+extern char *CopyOps_BuildOneRowTo(TupleDesc tupdesc, Datum *values, bool *nulls, int *len);
+
+#endif
diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h
index 32a88ecca4..5e26850d1c 100644
--- a/src/include/pgxc/execRemote.h
+++ b/src/include/pgxc/execRemote.h
@@ -48,6 +48,17 @@ typedef enum
REQUEST_TYPE_COPY_OUT /* Copy Out response */
} RequestType;
+/*
+ * Type of requests associated to a remote COPY OUT
+ */
+typedef enum
+{
+ REMOTE_COPY_NONE, /* Not defined yet */
+ REMOTE_COPY_STDOUT, /* Send back to client */
+ REMOTE_COPY_FILE, /* Write in file */
+ REMOTE_COPY_TUPLESTORE /* Store data in tuplestore */
+} RemoteCopyType;
+
/* Combines results of INSERT statements using multiple values */
typedef struct CombineTag
{
@@ -107,7 +118,8 @@ typedef struct RemoteQueryState
/* Simple DISTINCT support */
FmgrInfo *eqfunctions; /* functions to compare tuples */
MemoryContext tmp_ctx; /* separate context is needed to compare tuples */
- FILE *copy_file; /* used if copy_dest == COPY_FILE */
+ RemoteCopyType remoteCopyType; /* Type of remote COPY operation */
+ FILE *copy_file; /* used if remoteCopyType == REMOTE_COPY_FILE */
uint64 processed; /* count of data rows when running CopyOut */
/* cursor support */
char *cursor; /* cursor name */
@@ -136,7 +148,8 @@ extern void PGXCNodeCommitPrepared(char *gid);
/* Copy command just involves Datanodes */
extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot);
extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections);
-extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file);
+extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, TupleDesc tupleDesc,
+ FILE* copy_file, Tuplestorestate *store, RemoteCopyType remoteCopyType);
extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type);
extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error);
extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections);
diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h
index bd719911ea..78ce3cff00 100644
--- a/src/include/pgxc/locator.h
+++ b/src/include/pgxc/locator.h
@@ -99,6 +99,7 @@ extern RelationLocInfo *GetRelationLocInfo(Oid relid);
extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info);
extern char GetRelationLocType(Oid relid);
extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info);
+extern bool IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2);
extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
bool isValueNull, Oid typeOfValueForDistCol,
RelationAccessType accessType);
diff --git a/src/include/pgxc/redistrib.h b/src/include/pgxc/redistrib.h
new file mode 100644
index 0000000000..ee94523dbb
--- /dev/null
+++ b/src/include/pgxc/redistrib.h
@@ -0,0 +1,80 @@
+/*-------------------------------------------------------------------------
+ *
+ * redistrib.h
+ * Routines related to online data redistribution
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/include/pgxc/redistrib.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef REDISTRIB_H
+#define REDISTRIB_H
+
+#include "nodes/parsenodes.h"
+#include "utils/tuplestore.h"
+
+/*
+ * Type of data redistribution operations.
+ * Online data redistribution is made of one or more of those operations.
+ */
+typedef enum RedistribOperation {
+ DISTRIB_NONE, /* Default operation */
+ DISTRIB_DELETE_HASH, /* Perform a DELETE with hash value check */
+ DISTRIB_DELETE_MODULO, /* Perform a DELETE with modulo value check */
+ DISTRIB_COPY_TO, /* Perform a COPY TO */
+ DISTRIB_COPY_FROM, /* Perform a COPY FROM */
+ DISTRIB_TRUNCATE, /* Truncate relation */
+ DISTRIB_REINDEX /* Reindex relation */
+} RedistribOperation;
+
+/*
+ * Determine if operation can be done before or after
+ * catalog update on local node.
+ */
+typedef enum RedistribCatalog {
+ CATALOG_UPDATE_NONE, /* Default state */
+ CATALOG_UPDATE_AFTER, /* After catalog update */
+ CATALOG_UPDATE_BEFORE, /* Before catalog update */
+ CATALOG_UPDATE_BOTH /* Before and after catalog update */
+} RedistribCatalog;
+
+/*
+ * Redistribution command
+ * This contains the tools necessary to perform a redistribution operation.
+ */
+typedef struct RedistribCommand {
+ RedistribOperation type; /* Operation type */
+ ExecNodes *execNodes; /* List of nodes where to perform operation */
+ RedistribCatalog updateState; /* Flag to determine if operation can be done
+ * before or after catalog update */
+} RedistribCommand;
+
+/*
+ * Redistribution operation state
+ * Holds the state of one redistribution, including the list of commands
+ * to be performed during redistribution.
+ * The commands are kept in a List whose order is significant, as operations
+ * might need to be done in a certain order.
+ */
+typedef struct RedistribState {
+ Oid relid; /* Oid of relation redistributed */
+ List *commands; /* List of commands */
+ Tuplestorestate *store; /* Tuple store used for temporary data storage */
+} RedistribState;
+
+extern void PGXCRedistribTable(RedistribState *distribState, RedistribCatalog type);
+extern void PGXCRedistribCreateCommandList(RedistribState *distribState,
+ RelationLocInfo *newLocInfo);
+extern RedistribCommand *makeRedistribCommand(RedistribOperation type,
+ RedistribCatalog updateState,
+ ExecNodes *nodes);
+extern RedistribState *makeRedistribState(Oid relOid);
+extern void FreeRedistribState(RedistribState *state);
+extern void FreeRedistribCommand(RedistribCommand *command);
+
+#endif /* REDISTRIB_H */
diff --git a/src/include/pgxc/remotecopy.h b/src/include/pgxc/remotecopy.h
index 77134e71f9..93368c0ada 100644
--- a/src/include/pgxc/remotecopy.h
+++ b/src/include/pgxc/remotecopy.h
@@ -70,6 +70,7 @@ extern void RemoteCopy_BuildStatement(RemoteCopyData *state,
extern void RemoteCopy_GetRelationLoc(RemoteCopyData *state,
Relation rel,
List *attnums);
+extern RemoteCopyOptions *makeRemoteCopyOptions(void);
extern void FreeRemoteCopyData(RemoteCopyData *state);
extern void FreeRemoteCopyOptions(RemoteCopyOptions *options);
#endif
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index fde1467185..4eaabe6592 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -365,6 +365,14 @@ typedef struct StdRdOptions
#define RelationUsesTempNamespace(relation) \
((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
+#ifdef PGXC
+/*
+ * RelationGetLocInfo
+ * Return the location info of relation
+ */
+#define RelationGetLocInfo(relation) ((relation)->rd_locator_info)
+#endif
+
/*
* RELATION_IS_LOCAL
* If a rel is either temp or newly created in the current transaction,