I am really impressed with TSQL Fuzzy Look functionality......found somewhere during google.
-- ****************** Functions for Jaro **********************************************************************
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[fn_GetCommonCharacters]') AND type in (N'FN', N'IF', N'TF', N'FS', N'FT'))
DROP FUNCTION [dbo].[fn_GetCommonCharacters]
GO
CREATE FUNCTION [dbo].fn_GetCommonCharacters(@firstWord VARCHAR(MAX), @secondWord VARCHAR(MAX), @matchWindow INT)
RETURNS VARCHAR(MAX) AS
BEGIN
DECLARE @CommonChars VARCHAR(MAX)
DECLARE @copy VARCHAR(MAX)
DECLARE @char CHAR(1)
DECLARE @foundIT BIT
DECLARE @f1_len INT
DECLARE @f2_len INT
DECLARE @i INT
DECLARE @j INT
DECLARE @j_Max INT
SET @CommonChars = ''
IF @firstWord IS NOT NULL AND @secondWord IS NOT NULL
BEGIN
SET @f1_len = LEN(@firstWord)
SET @f2_len = LEN(@secondWord)
SET @copy = @secondWord
SET @i = 1
WHILE @i < (@f1_len + 1)
BEGIN
SET @char = SUBSTRING(@firstWord, @i, 1)
SET @foundIT = 0
-- Set J starting value
IF @i - @matchWindow > 1
BEGIN
SET @j = @i - @matchWindow
END
ELSE
BEGIN
SET @j = 1
END
-- Set J stopping value
IF @i + @matchWindow <= @f2_len
BEGIN
SET @j_Max = @i + @matchWindow
END
ELSE
IF @f2_len < @i + @matchWindow
BEGIN
SET @j_Max = @f2_len
END
WHILE @j < (@j_Max + 1) AND @foundIT = 0
BEGIN
IF SUBSTRING(@copy, @j, 1) = @char
BEGIN
SET @foundIT = 1
SET @CommonChars = @CommonChars + @char
SET @copy = STUFF(@copy, @j, 1, '#')
END
SET @j = @j + 1
END
SET @i = @i + 1
END
END
RETURN @CommonChars
END
GO
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[fn_calculateMatchWindow]') AND type in (N'FN', N'IF', N'TF', N'FS', N'FT'))
DROP FUNCTION [dbo].[fn_calculateMatchWindow]
GO
CREATE FUNCTION [dbo].[fn_calculateMatchWindow](@s1_len INT, @s2_len INT)
RETURNS INT AS
BEGIN
DECLARE @matchWindow INT
SET @matchWindow = CASE WHEN @s1_len >= @s2_len
THEN (@s1_len / 2) - 1
ELSE (@s2_len / 2) - 1
END
RETURN @matchWindow
END
GO
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[fn_calculateTranspositions]') AND type in (N'FN', N'IF', N'TF', N'FS', N'FT'))
DROP FUNCTION [dbo].[fn_calculateTranspositions]
GO
CREATE FUNCTION [dbo].[fn_calculateTranspositions](@s1_len INT, @str1 VARCHAR(MAX), @str2 VARCHAR(MAX))
RETURNS INT AS
BEGIN
DECLARE @transpositions INT
DECLARE @i INT
SET @transpositions = 0
SET @i = 0
WHILE @i < @s1_len
BEGIN
IF SUBSTRING(@str1, @i+1, 1) <> SUBSTRING(@str2, @i+1, 1)
BEGIN
SET @transpositions = @transpositions + 1
END
SET @i = @i + 1
END
SET @transpositions = @transpositions / 2
RETURN @transpositions
END
GO
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[fn_calculateJaro]') AND type in (N'FN', N'IF', N'TF', N'FS', N'FT'))
DROP FUNCTION [dbo].[fn_calculateJaro]
GO
CREATE FUNCTION [dbo].[fn_calculateJaro](@str1 VARCHAR(MAX), @str2 VARCHAR(MAX))
RETURNS FLOAT AS
BEGIN
DECLARE @Common1 VARCHAR(MAX)
DECLARE @Common2 VARCHAR(MAX)
DECLARE @Common1_Len INT
DECLARE @Common2_Len INT
DECLARE @s1_len INT
DECLARE @s2_len INT
DECLARE @transpose_cnt INT
DECLARE @match_window INT
DECLARE @jaro_distance FLOAT
SET @transpose_cnt = 0
SET @match_window = 0
SET @jaro_distance = 0
Set @s1_len = LEN(@str1)
Set @s2_len = LEN(@str2)
SET @match_window = dbo.fn_calculateMatchWindow(@s1_len, @s2_len)
SET @Common1 = dbo.fn_GetCommonCharacters(@str1, @str2, @match_window)
SET @Common1_Len = LEN(@Common1)
IF @Common1_Len = 0 OR @Common1 IS NULL
BEGIN
RETURN 0
END
SET @Common2 = dbo.fn_GetCommonCharacters(@str2, @str1, @match_window)
SET @Common2_Len = LEN(@Common2)
IF @Common1_Len <> @Common2_Len OR @Common2 IS NULL
BEGIN
RETURN 0
END
SET @transpose_cnt = dbo.[fn_calculateTranspositions](@Common1_Len, @Common1, @Common2)
SET @jaro_distance = @Common1_Len / (3.0 * @s1_len) +
@Common1_Len / (3.0 * @s2_len) +
(@Common1_Len - @transpose_cnt) / (3.0 * @Common1_Len);
RETURN @jaro_distance
END
GO
-- ****************** Functions for Jaro **********************************************************************
-- ****************** Functions for Jaro-Winkler ***************************************************************
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[fn_calculatePrefixLength]') AND type in (N'FN', N'IF', N'TF', N'FS', N'FT'))
DROP FUNCTION [dbo].[fn_calculatePrefixLength]
GO
CREATE FUNCTION [dbo].[fn_calculatePrefixLength](@firstWord VARCHAR(MAX), @secondWord VARCHAR(MAX))
RETURNS INT As
BEGIN
DECLARE @f1_len INT
DECLARE @f2_len INT
DECLARE @minPrefixTestLength INT
DECLARE @i INT
DECLARE @n INT
DECLARE @foundIT BIT
SET @minPrefixTestLength = 4
IF @firstWord IS NOT NULL AND @secondWord IS NOT NULL
BEGIN
SET @f1_len = LEN(@firstWord)
SET @f2_len = LEN(@secondWord)
SET @i = 0
SET @foundIT = 0
SET @n = CASE WHEN @minPrefixTestLength < @f1_len
AND @minPrefixTestLength < @f2_len
THEN @minPrefixTestLength
WHEN @f1_len < @f2_len
AND @f1_len < @minPrefixTestLength
THEN @f1_len
ELSE @f2_len
END
WHILE @i < @n AND @foundIT = 0
BEGIN
IF SUBSTRING(@firstWord, @i+1, 1) <> SUBSTRING(@secondWord, @i+1, 1)
BEGIN
SET @minPrefixTestLength = @i
SET @foundIT = 1
END
SET @i = @i + 1
END
END
RETURN @minPrefixTestLength
END
GO
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[fn_calculateJaroWinkler]') AND type in (N'FN', N'IF', N'TF', N'FS', N'FT'))
DROP FUNCTION [dbo].[fn_calculateJaroWinkler]
GO
CREATE FUNCTION [dbo].[fn_calculateJaroWinkler](@str1 VARCHAR(MAX), @str2 VARCHAR(MAX))
RETURNS float As
BEGIN
DECLARE @jaro_distance FLOAT
DECLARE @jaro_winkler_distance FLOAT
DECLARE @prefixLength INT
DECLARE @prefixScaleFactor FLOAT
SET @prefixScaleFactor = 0.1 --Constant = .1
SET @jaro_distance = dbo.fn_calculateJaro(@str1, @str2)
SET @prefixLength = dbo.fn_calculatePrefixLength(@str1, @str2)
SET @jaro_winkler_distance = @jaro_distance + ((@prefixLength * @prefixScaleFactor) * (1.0 - @jaro_distance))
RETURN @jaro_winkler_distance
END
GO
-- ****************** Functions for Jaro-Winkler ***************************************************************
-- ****************** Miscellaneous Functions *****************************************************************
IF EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[fn_StrClean]') AND type in (N'FN', N'IF', N'TF', N'FS', N'FT'))
DROP FUNCTION [dbo].[fn_StrClean]
GO
CREATE Function [dbo].[fn_StrClean](@p_str1 VARCHAR(MAX))
RETURNS VARCHAR(MAX) as
BEGIN
DECLARE @ret_value VARCHAR(MAX)
SET @ret_value = @p_str1
SET @ret_value = REPLACE(@ret_value, '.', '')
SET @ret_value = REPLACE(@ret_value, ',', '')
SET @ret_value = REPLACE(@ret_value, '-', '')
SET @ret_value = REPLACE(@ret_value, ';', '')
SET @ret_value = REPLACE(@ret_value, ':', '')
RETURN @ret_value
END
GO
-- ****************** Miscellaneous Functions *****************************************************************
--For testing these functions run
--Example 1
declare @i float
select @i=[dbo].[fn_calculateJaroWinkler]('Peter','Pete')
print @i
--Example 2
CREATE TABLE [dbo].[NameInput](
[Cust_Id] [nvarchar](10) NULL,
[Name_Input] [nvarchar](10) NULL,
[Last_Name] [nchar](10) NULL
) ON [PRIMARY]
CREATE TABLE [dbo].[NameLookup](
[name_group_id] [nvarchar](10) NULL,
[first_name] [nvarchar](10) NULL,
[first_name_normalized] [nchar](10) NULL
) ON [PRIMARY]
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('11', 'Tomas', 'Jones ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('22', 'Peter', 'Jackson ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('33', 'Theresa', 'Smith ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('333', 'Therese', 'Smith ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('111', 'Tom', 'Jones ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('222', 'Pete', 'Jackson ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('55', 'Johnathan', 'Oconner ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('555', 'Johnatiag', 'Oconner ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('1111', 'Thhomas', 'Jones ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('2222', 'Petet', 'Jackson ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('3333', 'Theres', 'Smith ')
insert into NameInput (Cust_Id, Name_Input, Last_Name) values ('5555', 'Jonathan', 'Oconner ')
insert into NameLookup (name_group_id, first_name, first_name_normalized) values ('3', 'Theresa', 'Theresa ')
insert into NameLookup (name_group_id, first_name, first_name_normalized) values ('4', 'Johnathan', 'Johnathan ')
insert into NameLookup (name_group_id, first_name, first_name_normalized) values ('1', 'Tom', 'Thomas ')
insert into NameLookup (name_group_id, first_name, first_name_normalized) values ('2', 'Pete', 'Peter ')
insert into NameLookup (name_group_id, first_name, first_name_normalized) values ('1', 'Thomas', 'Thomas ')
insert into NameLookup (name_group_id, first_name, first_name_normalized) values ('2', 'Peter', 'Peter ')
SELECT NameLookup.name_group_id, NameInput.Cust_Id, NameInput.Name_Input, NameLookup.first_name ,NameLookup.first_name_normalized, NameInput.Last_Name
, dbo.[fn_calculateJaroWinkler](NameInput.Name_Input, NameLookup.first_name) AS Jaro3
, RANK() OVER (Partition BY NameLookup.name_group_id ORDER BY dbo.[fn_calculateJaroWinkler](NameInput.Name_Input, NameLookup.first_name) Desc) Jar02
FROM NameInput
CROSS JOIN NameLookup
Where dbo.[fn_calculateJaroWinkler](NameInput.Name_Input, NameLookup.first_name) > .95
order by NameLookup.name_group_id
--Example 3
DECLARE @SRC TABLE
(
source_string varchar(50) NOT NULL
, ref_id int identity(1,1) NOT NULL
);
-- Identify matches
DECLARE @WORK TABLE
(
source_ref_id int NOT NULL
, match_ref_id int NOT NULL
);
INSERT INTO
@src
SELECT 'Jon Q'
UNION ALL SELECT 'John Q'
UNION ALL SELECT 'JOHN Q'
UNION ALL SELECT 'Jonn Q'
-- Oops on matching joan to jon
UNION ALL SELECT 'Joan Q'
UNION ALL SELECT 'june'
UNION ALL SELECT 'Mary W'
UNION ALL SELECT 'Marie W'
UNION ALL SELECT 'Matt H';
-- 2 problems to address
-- duplicates in our inbound set
-- duplicates against a reference set
--
-- Better matching will occur if names are split into ordinal entities
-- Splitting on whitespace is always questionable
--
-- Mat, Matt, Matthew
DECLARE CSR CURSOR
READ_ONLY
FOR
SELECT DISTINCT
S1.source_string
, S1.ref_id
FROM
@SRC AS S1
ORDER BY
S1.ref_id;
DECLARE @source_string varchar(50), @ref_id int
OPEN CSR
FETCH NEXT FROM CSR INTO @source_string, @ref_id
WHILE (@@fetch_status <> -1)
BEGIN
IF (@@fetch_status <> -2)
BEGIN
IF NOT EXISTS
(
SELECT * FROM @WORK W WHERE W.match_ref_id = @ref_id
)
BEGIN
INSERT INTO
@WORK
SELECT
@ref_id
, S.ref_id
FROM
@src S
-- If we have already matched the value, skip it
LEFT OUTER JOIN
@WORK W
ON W.match_ref_id = S.ref_id
WHERE
-- Don't match yourself
S.ref_id <> @ref_id
-- arbitrary threshold, will need to examine this for sanity
AND dbo.FuzzyLookUp(@source_string, S.source_string) > .95
END
END
FETCH NEXT FROM CSR INTO @source_string, @ref_id
END
CLOSE CSR
DEALLOCATE CSR
------ Show me the list of all the unmatched rows
------ plus the retained
----;WITH MATCHES AS
----(
---- SELECT
---- S1.source_string
---- , S1.ref_id
---- , S2.source_string AS match_source_string
---- , S2.ref_id AS match_ref_id
---- FROM
---- @SRC S1
---- INNER JOIN
---- @WORK W
---- ON W.source_ref_id = S1.ref_id
---- INNER JOIN
---- @SRC S2
---- ON S2.ref_id = W.match_ref_id
----)
----, UNMATCHES AS
----(
---- SELECT
---- S1.source_string
---- , S1.ref_id
---- , NULL AS match_source_string
---- , NULL AS match_ref_id
---- FROM
---- @SRC S1
---- LEFT OUTER JOIN
---- @WORK W
---- ON W.source_ref_id = S1.ref_id
---- LEFT OUTER JOIN
---- @WORK S2
---- ON S2.match_ref_id = S1.ref_id
---- WHERE
---- W.source_ref_id IS NULL
---- and s2.match_ref_id IS NULL
----)
----SELECT
---- M.source_string
----, M.ref_id
----, M.match_source_string
----, M.match_ref_id
----FROM
---- MATCHES M
----UNION ALL
----SELECT
---- M.source_string
----, M.ref_id
----, M.match_source_string
----, M.match_ref_id
----FROM
---- UNMATCHES M;
------ To specifically solve your request
SELECT
S.source_string AS Name
, COALESCE(S2.source_string, S.source_string) As StdName
FROM
@SRC S
LEFT OUTER JOIN
@WORK W
ON W.match_ref_id = S.ref_id
LEFT OUTER JOIN
@SRC S2
ON S2.ref_id = W.source_ref_id