Removing duplicates by using the substring of attribute value in XSLT

I have an XML document in which duplicates are identified by the last part of the id attribute of <rf>. For example, in the sample XML below, <rf id="cc_2_30"> and <rf id="cc_3_30"> are duplicates since both ids end in '_30'.

<rfs>
  <rf id="cc_2_30">
    <addData>
      <CC/>
      <key/>
      <recs>
        <rec/>
      </recs>
    </addData>
  </rf>
  <rf id="cc_3_30">
    <addData>
      <CC/>
      <key/>
      <recs>
        <rec/>
      </recs>
    </addData>
  </rf>
  <rf id="cc_3_13">
    <addData>
      <CC/>
      <key/>
      <recs>
        <rec/>
      </recs>
    </addData>
  </rf>
</rfs>

So my requirement is to keep only one of the duplicates, as shown below:

<rfs>
  <rf id="cc_2_30">
    <addData>
      <CC/>
      <key/>
      <recs>
        <rec/>
      </recs>
    </addData>
  </rf>
  <rf id="cc_3_13">
    <addData>
      <CC/>
      <key/>
      <recs>
        <rec/>
      </recs>
    </addData>
  </rf>
</rfs>

My updated XSLT, after adding the code to remove duplicates, is:

<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:gal="http://www.tridan.it/gal"  version="2.0">

    <!-- Indent the serialized result; strip whitespace-only text nodes from all input elements. -->
    <xsl:output indent="yes" />
    <xsl:strip-space elements="*" />
    <!-- A single apostrophe, held in a variable so it can be concatenated into the translate() map below. -->
    <xsl:variable name="APOS" select='"&apos;"'/>
    
    <!-- gal:formatId: passes $unformattedId through translate() with a one-character replacement string ('_').
         NOTE(review): with translate(), only the first character of the map (the space) is turned into '_';
         every other listed character has no counterpart in the replacement string and is removed.
         Confirm that removal, rather than replacement by '_', is what is wanted. -->
    <xsl:function name="gal:formatId">
    <xsl:param name="unformattedId"/>
    <xsl:value-of select="translate($unformattedId, concat(' &amp;/()][*,’|\',$APOS), '_')"/>
    </xsl:function>

<!-- Identity template: copies any node or attribute that no more specific template handles. -->
<xsl:template match="node()|@*">
    <xsl:copy>
        <xsl:apply-templates select="node()|@*"/>
    </xsl:copy>
    </xsl:template>
    
    <!-- Rebuilds the content of the rfs document element.
         For every distinct first value of a 'CC' entry, combined with every distinct first value of an
         'Att' entry, one rf is written per input rf that owns a rec carrying both values. -->
    <xsl:template match="/rfs"  exclude-result-prefixes="gal">
    <xsl:copy>
        <xsl:for-each-group select="rf/recs/rec/addData/entry[key='CC']" group-by="value[1]" >
            <!-- Inside for-each-group the context item is the first entry of the current group. -->
            <xsl:variable name="cc1value" select="./value[1]"/>
            <xsl:for-each-group select="//rf/recs/rec/addData/entry[key='Att']" group-by="value[1]" >
                <xsl:variable name="cc2value" select="./value[1]"/>

            
                <xsl:for-each select="//rf[recs/rec/addData[entry[key='CC' and value=$cc1value] and entry[key='Att' and value=$cc2value]]]">
                    <!-- The part of the source id that follows its fourth underscore. -->
                    <xsl:variable name="famId">
                        <xsl:value-of select="substring-after(substring-after(substring-after(substring-after(./@id,'_'),'_'),'_'),'_')"></xsl:value-of>
                    </xsl:variable>
                    <!-- New id: CC value, Att value and the id tail, joined by '_' and passed through gal:formatId. -->
                    <rf id="{gal:formatId(concat($cc1value,'_',$cc2value,'_',$famId))}">
                        <xsl:copy-of select="./addData"/>
                        <cc><xsl:value-of select="gal:formatId(concat($cc1value,'_',$cc2value))"/></cc> 
                        <name><xsl:value-of select="./name"/></name>
                        <recs>          
                            <!-- Only the rec children that carry both the current CC and Att values are kept. -->
                            <xsl:for-each select="recs/rec[addData[entry[key='CC' and value=$cc1value] and entry[key='Att' and value=$cc2value]]]">
                            <xsl:variable name="recID">
                                <xsl:value-of select="substring-after(substring-after(substring-after(substring-after(./@id,'_'),'_'),'_'),'_')"></xsl:value-of>
                            </xsl:variable>
                            <rec id ="{gal:formatId(concat($cc1value,'_',$cc2value,'_',$recID))}">
                                <xsl:copy-of select="./addData"/>
                            </rec>
                            </xsl:for-each>
                        </recs>
                    </rf>
                </xsl:for-each>
        
            </xsl:for-each-group>
        </xsl:for-each-group>
    </xsl:copy>
</xsl:template>

<!-- Attempted duplicate removal: groups rf children by the last '_'-separated token of @id.
     NOTE(review): the pattern /recs only matches a document element named recs, while the sample
     document element shown in the question is rfs, so this template is not applied to that input.
     NOTE(review): the copy-of placed before the grouping emits every rf child, duplicates included.
     NOTE(review): inside the for-each-group the context item is an rf element, so select="rf" looks
     for rf children of that rf; select="." would refer to the first rf of the current group. -->
<xsl:template match="/recs">
    <xsl:copy>
        <xsl:copy-of select="rf"/>
        <xsl:for-each-group select="rf" group-by="tokenize(@id, '_')[last()]">
            <xsl:copy-of select="rf"/>
        </xsl:for-each-group>
    </xsl:copy>
</xsl:template>

</xsl:stylesheet>

My concern now is that the duplicates are removed, but the <rf id=""/>, <cc> and <rec id=""/> values are also being changed, which is not required. I want to retain the values that I have set in the XSLT.

Please guide me in the right direction.



Solution 1:[1]

The sample has e.g. id="cc_2_30" with two _, so your XPath (I would use tokenize(@id, '_')[last()] given that you have XSLT 2) should use substring-after two times and not four times.

With all the edits and comments, I suppose you want something along the lines of

<xsl:stylesheet version="2.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:gal="http://www.tridan.it/gal"
                exclude-result-prefixes="gal">

  <xsl:output indent="yes"/>
  <xsl:strip-space elements="*"/>

  <!-- A lone apostrophe, kept in a variable so it can be appended to the translate() map. -->
  <xsl:variable name="APOS" select='"&apos;"'/>

  <!-- gal:formatId: applies translate() to $unformattedId using a one-character replacement string.
       NOTE(review): translate() maps only the first listed character (the space) to '_'; the other
       listed characters have no counterpart in the replacement string and are removed. Confirm that
       this is the intended result. -->
  <xsl:function name="gal:formatId">
    <xsl:param name="unformattedId"/>
    <xsl:value-of select="translate($unformattedId, concat(' &amp;/()][*,’|\', $APOS), '_')"/>
  </xsl:function>

  <!-- Identity transform for everything that has no dedicated template. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

  <!-- Two-pass rebuild of the rfs document element:
       pass 1 builds the renamed rf elements into a temporary tree,
       pass 2 keeps one rf per distinct trailing token of the generated id. -->
  <xsl:template match="/rfs">
    <xsl:copy>

      <!-- Pass 1: one rf per (CC value, Att value, matching input rf) combination. -->
      <xsl:variable name="first-step">
        <xsl:for-each-group select="rf/recs/rec/addData/entry[key = 'CC']" group-by="value[1]">
          <!-- The context item is the first entry of the current group. -->
          <xsl:variable name="cc1value" select="value[1]"/>

          <xsl:for-each-group select="//rf/recs/rec/addData/entry[key = 'Att']" group-by="value[1]">
            <xsl:variable name="cc2value" select="value[1]"/>

            <xsl:for-each select="//rf[recs/rec/addData[entry[key = 'CC' and value = $cc1value] and entry[key = 'Att' and value = $cc2value]]]">
              <!-- Whatever follows the fourth underscore of the source id. -->
              <xsl:variable name="famId">
                <xsl:value-of select="substring-after(substring-after(substring-after(substring-after(@id, '_'), '_'), '_'), '_')"/>
              </xsl:variable>

              <rf id="{gal:formatId(concat($cc1value, '_', $cc2value, '_', $famId))}">
                <xsl:copy-of select="addData"/>
                <cc>
                  <xsl:value-of select="gal:formatId(concat($cc1value, '_', $cc2value))"/>
                </cc>
                <name>
                  <xsl:value-of select="name"/>
                </name>
                <recs>
                  <!-- Keep only the rec children that carry both current values. -->
                  <xsl:for-each select="recs/rec[addData[entry[key = 'CC' and value = $cc1value] and entry[key = 'Att' and value = $cc2value]]]">
                    <xsl:variable name="recID">
                      <xsl:value-of select="substring-after(substring-after(substring-after(substring-after(@id, '_'), '_'), '_'), '_')"/>
                    </xsl:variable>
                    <rec id="{gal:formatId(concat($cc1value, '_', $cc2value, '_', $recID))}">
                      <xsl:copy-of select="addData"/>
                    </rec>
                  </xsl:for-each>
                </recs>
              </rf>
            </xsl:for-each>

          </xsl:for-each-group>
        </xsl:for-each-group>
      </xsl:variable>

      <!-- Pass 2: group the generated rf elements by the last '_'-separated token of their id and
           push only the first member of each group through the identity template. -->
      <xsl:for-each-group select="$first-step/rf" group-by="tokenize(@id, '_')[last()]">
        <xsl:apply-templates select="."/>
      </xsl:for-each-group>

    </xsl:copy>
  </xsl:template>

</xsl:stylesheet>

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution Source
Solution 1