summaryrefslogtreecommitdiff
path: root/contrib/fulltextindex/fti.pl
blob: 17f000149b76b23ac8a59e9e0647942d5d0ba6ed (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/perl
#
# This script substracts all suffixes of all words in a specific column in a table
# and generates output that can be loaded into a new table with the
# psql '\copy' command. The new table should have the following structure:
#
#	create table tab (
#		string text,
#		id oid
#	);
#
# Note that you cannot use 'copy' (the SQL-command) directly, because
# there's no '\.' included at the end of the output.
#
# The output can be fed through the UNIX commands 'uniq' and 'sort'
# to generate the smallest and sorted output to populate the fti-table.
#
# Example:
#
# 	fti.pl -u -d mydb -t mytable -c mycolumn,mycolumn2 -f myfile
#	sort -o myoutfile myfile
#	uniq myoutfile sorted-file
#
# 	psql -u mydb
#
#		\copy my_fti_table from myfile
#
#  		create index fti_idx on my_fti_table (string,id);
#
#		create function fti() returns trigger as
#			'/path/to/fti/file/fti.so'
#		language 'C';
#
#		create trigger my_fti_trigger after update or insert or delete
#			on mytable
#				for each row execute procedure fti(my_fti_table, mycolumn);
#
# Make sure you have an index on mytable(oid) to be able to do somewhat
# efficient substring searches.

#use lib '/usr/local/pgsql/lib/perl5/';
use lib '/mnt/web/guide/postgres/lib/perl5/site_perl';
use Pg;
use Getopt::Std;

$PGRES_EMPTY_QUERY    = 0 ;
$PGRES_COMMAND_OK     = 1 ;
$PGRES_TUPLES_OK      = 2 ;
$PGRES_COPY_OUT       = 3 ;
$PGRES_COPY_IN        = 4 ;
$PGRES_BAD_RESPONSE   = 5 ;
$PGRES_NONFATAL_ERROR = 6 ;
$PGRES_FATAL_ERROR    = 7 ;

# the minimum length of word to include in the full text index
$MIN_WORD_LENGTH = 2;

# the minimum length of the substrings in the full text index
$MIN_SUBSTRING_LENGTH = 2;

$[ = 0; # make sure string offsets start at 0

sub break_up {
	my $string = pop @_;

	# convert strings to lower case
	$string = lc($string);
	@strings = split(/\W+/, $string);
	@subs = ();

	foreach $s (@strings) {
		$len = length($s);
		next if ($len <= $MIN_WORD_LENGTH);
		for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) {
			$tmp = substr($s, $i);
			push(@subs, $tmp);
		}
	}

	return @subs;
}

sub connect_db {
	my $dbname = shift @_;
	my $user   = shift @_;
	my $passwd = shift @_;

	if (!defined($dbname) || $dbname eq "") {
		return 1;
	}
	$connect_string = "dbname=$dbname";

	if ($user ne "") {
		if ($passwd eq "") {
			return 0;
		}
		$connect_string = "$connect_string user=$user password=$passwd ".
		  "authtype=password";
	}
	
	$PG_CONN = PQconnectdb($connect_string);

	if (PQstatus($PG_CONN)) {
		print STDERR "Couldn't make connection with database!\n";
		print STDERR PQerrorMessage($PG_CONN), "\n";
		return 0;
	}

	return 1;
}

sub quit_prog {
	close(OUT);
	unlink $opt_f;
	if (defined($PG_CONN)) {
		PQfinish($PG_CONN);
	}
	exit 1;
}

sub get_username {
	print "Username: ";
	chop($n = <STDIN>);

    return $n;;
}

sub get_password {
	print "Password: ";

	system("stty -echo < /dev/tty");
	chop($pwd = <STDIN>);
	print "\n";
	system("stty echo < /dev/tty");

	return $pwd;
}

sub main {
	getopts('d:t:c:f:u');

	if (!$opt_d || !$opt_t || !$opt_c || !$opt_f) {
		print STDERR "usage: $0 [-u] -d database -t table -c column[,column...] ".
		  "-f output-file\n";
		return 1;
	}

	@cols = split(/,/, $opt_c);

	if (defined($opt_u)) {
		$uname = get_username();
		$pwd   = get_password();
	} else {
		$uname = "";
		$pwd   = "";
	}

	$SIG{'INT'} = 'quit_prog';
	if (!connect_db($opt_d, $uname, $pwd)) {
		print STDERR "Connecting to database failed!\n";
		return 1;
	}

	if (!open(OUT, ">$opt_f")) {
		print STDERR "Couldnt' open file '$opt_f' for output!\n";
		return 1;
	}

	PQexec($PG_CONN, "SET search_path = public");
	PQexec($PG_CONN, "begin");

	$query = "declare C cursor for select (\"";
	$query .= join("\" || ' ' || \"", @cols);
	$query .= "\") as string, oid from $opt_t";
	$res = PQexec($PG_CONN, $query);
	if (!$res || (PQresultStatus($res) != $PGRES_COMMAND_OK)) {
		print STDERR "Error declaring cursor!\n";
		print STDERR PQerrorMessage($PG_CONN), "\n";
		PQfinish($PG_CONN);
		return 1;
	}
	PQclear($res);

	$query = "fetch in C";
	while (($res = PQexec($PG_CONN, $query)) &&
		   (PQresultStatus($res) == $PGRES_TUPLES_OK) &&
		   (PQntuples($res) == 1)) {
		$col = PQgetvalue($res, 0, 0);
		$oid = PQgetvalue($res, 0, 1);

		@subs = break_up($col);
		foreach $i (@subs) {
			print OUT "$i\t$oid\n";
		}
	}

	if (!$res || (PQresultStatus($res) != PGRES_TUPLES_OK)) {
		print STDERR "Error retrieving data from backend!\n";
		print STDERR PQerrorMEssage($PG_CONN), "\n";
		PQfinish($PG_CONN);
		return 1;
	}

	PQclear($res);
	PQfinish($PG_CONN);

	return 0;
}

exit main();