strcmp() is a library function in C/C++ which compares two strings. It takes two strings as input
parameter and decides which one is lexicographically larger or smaller: If the rst string is greater then
it returns a positive value, if the second string is greater it returns a negative value and if two strings
are equal it returns a zero. The code that is used to compare two strings in C/C++ library is shown
below:
Figure: The standard strcmp() code provided for this problem.
The number of comparisons required to compare two strings in strcmp() function is never returned
by the function. But for this problem you will have to do just that at a larger scale. strcmp() function
continues to compare characters in the same position of the two strings until two different characters
are found or both strings come to an end. Of course it assumes that last character of a string is a null
(`\0') character. For example the table below shows what happens when \than" and \that"; \therE"
and \the" are compared using strcmp() function. To understand how 7 comparisons are needed in
both cases please consult the code block given above.
分析:由于字符集计较大,需要用左儿子--右兄弟表示法保存Trie。
方法一:链表实现。
#include <cstdio>
#include <cstring>
#include <cstdlib>
using namespace std;
const int LEN = 1000 + 5;
struct Node{
int tot;// 当前节点为根的子树包含的叶结点总数
char c; // 结点上的字符
Node * lChild; //point to first son
Node * rChild; // point to next brother
};
// 字母表为全体小写字母的Trie
struct Trie{
Node* root;
long long ans;
Trie(){
root = (Node*)malloc(sizeof(Node));
}
void clear(){ // only one root node
root->lChild = NULL; //initialize root node
root->rChild = NULL;
root->tot = 0;
}
//insert string (include '\0')
void insert(char *s){
int n = strlen(s);
root->tot++;
Node* v;
Node* u = root;
for(int i = 0; i <= n; i++){
bool find = false; // find char s[i];
for(v = u->lChild; v ; v = v->rChild){ //遍历u的所有子节点
if(v->c == s[i]){
find = true;
break;
}
}
if(!find){
v = (Node*)malloc( sizeof(Node)); //build a new node v;
v->rChild = u->lChild;
u->lChild = v; // node v insert to head of list u;
v->tot = 0; // initialize node v;
v->c = s[i];
v->lChild = NULL;
}
u = v;
u->tot++;
}
}
// 统计LCP=u的所有单词两两的比较次数之和
void dfs(int depth,const Node* u){
if( u->lChild == NULL){ // leaf node
ans += u->tot * (u->tot-1) * depth; // 有可能同一单词重复出现多次
}else{
int sum = 0;
for(Node* v = u->lChild; v ; v = v->rChild){
sum += v->tot * (u->tot - v->tot); 子树v中选一个串,其他子树中再选一个
}
ans += sum/2 * (2*depth+1); // 相同比较两次(s[i] == t[i], s[i] == '\0'),不同在s[i] == t[i]退出
for(Node* v = u->lChild; v ; v = v->rChild){ //子节点当做新的根节点dfs
dfs(depth+1, v);
}
}
}
long long count(){
ans = 0;
dfs(0, root);
return ans;
}
};
char word[LEN];
Trie trie;
int main(int argc, char** argv) {
int n, kase = 0;
while( scanf("%d",&n) == 1 && n){
trie.clear();
for(int i = 0; i < n; i++){
scanf("%s",word);
trie.insert(word);
}
printf("Case %d: %lld\n", ++kase, trie.count());
}
return 0;
}
方法二:两个数组模拟实现链表。
#include <cstdio>
#include <cstring>
using namespace std;
const int LEN = 1000 + 5,MAX_NODE = 4001*1000 +5;
// 字母表为全体小写字母的Trie
struct Trie{
int head[MAX_NODE]; //point to first son
int next[MAX_NODE]; // point to next brother
int tot[MAX_NODE]; // tot[i]为第i个结点为根的子树包含的叶结点总数
int ch[MAX_NODE]; // ch[i]为第i个结点上的字符
int size;
long long ans;
void clear(){ //只有一个根节点
size = 1;
head[0] = next[0] = tot[0] = 0;
}
//insert string (include '\0')
void insert(char *s){
int u = 0, v, n = strlen(s);
tot[0]++;
for(int i = 0; i <= n; i++){
bool find = false; // find char s[i];
for(v = head[u]; v != 0; v = next[v]){
if(ch[v] == s[i]){
find = true;
break;
}
}
if(!find){
v = size++; //build a new node v;
next[v] = head[u];
head[u] = v; // node v insert to head of list;
tot[v] = 0; // initialize node v;
ch[v] = s[i];
head[v] = 0;
}
u = v;
tot[u]++;
}
}
// 统计LCP=u的所有单词两两的比较次数之和
void dfs(int depth,int u){
if( head[u] == 0){ // leaf node
ans += tot[u] * (tot[u]-1) * depth; // 有可能同一单词重复出现多次
}else{
int sum = 0;
for(int v = head[u]; v != 0; v = next[v]){
sum += tot[v] * (tot[u] - tot[v]); 子树v中选一个串,其他子树中再选一个
}
ans += sum/2 * (2*depth+1); // 相同比较两次(s[i] == t[i], s[i] == '\0'),不同在s[i] == t[i]退出
for(int v = head[u]; v != 0; v = next[v]){ //子节点当做新的根节点dfs
dfs(depth+1,v);
}
}
}
long long count(){
ans = 0;
dfs(0, 0);
return ans;
}
};
char word[LEN];
Trie trie;
int main(int argc, char** argv) {
int n, kase = 0;
while( scanf("%d",&n) == 1 && n){
trie.clear();
for(int i = 0; i < n; i++){
scanf("%s",word);
trie.insert(word);
}
printf("Case %d: %lld\n", ++kase, trie.count());
}
return 0;
}
Input
The input le contains maximum 10 sets of inputs. The description of each set is given below:
Each set starts with an integer N (0 < N < 4001) which denotes the total number of strings. Each
of the next N lines contains one string. Strings contain only alphanumerals (`0'...`9', `A'...`Z', `a'...`z')
have a maximum length of 1000, and a minimum length of 1.
Input is terminated by a line containing a single zero.
Output
For each set of input produce one line of output. This line contains the serial of output followed by
an integer T. This T denotes the total number of comparisons that are required in the strcmp()
function if all the strings are compared with one another exactly once. So for N strings the function
strcmp() will be called exactly N(N?1)
2 times. You have to calculate total number of comparisons
inside the strcmp() function in those N(N?1)
2 calls. You can assume that the value of T will t safely
in a 64-bit signed integer. Please note that the most straightforward solution (Worst Case Complexity
O(N2 1000)) will time out for this problem.
Sample Input
2
a
b
4
cat
hat
mat
sir
0
Sample Output
Case 1: 1
Case 2: 6