-
Notifications
You must be signed in to change notification settings - Fork 0
/
REgex.py
133 lines (88 loc) · 3.38 KB
/
REgex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 24 01:21:37 2021
@author: Aditi.Dhamat
"""
#Regex
import regex as re
# Sample inputs for the basic demonstrations below.
sen = "I was born in 1992"
sen2 = ""

# \d matches a single digit; substituting the empty string deletes every
# digit (nothing is inserted in its place).
sen1 = re.compile(r"\d").sub("", sen)
print(sen1)

# ".*" means zero or more of any character, so it matches the whole sentence.
print(re.compile(r".*").match(sen))

# ".+" means one or more of any character, so it cannot match the empty
# string and the result is None.
print(re.compile(r".+").match(sen2))

# "[a-zA-Z]+" matches a run of one or more ASCII letters at the start of
# the sentence.
print(re.compile(r"[a-zA-Z]+").match(sen))
# "?" makes the token before it optional (zero or one occurrence).
# "ab?" therefore matches "a" followed by at most one "b": on "abb" the
# match is "ab" and the second "b" is not part of it.
sen3 = "abb"
print(re.compile(r"ab?").match(sen3))
# NOTE: re.match only tries the pattern at the very beginning of the string;
# it never scans forward to look for a later occurrence.
sen4 = "1992 was the year when I was born"

# The sentence starts with digits, so a letters-only pattern cannot match at
# position 0 and re.match returns None even though letters appear later.
print(re.match(r"[a-zA-Z]+", sen4))

# re.search scans the whole string and returns the first match found anywhere.
print(re.search(r"[a-zA-Z]+", sen4))

# "^" anchors the pattern to the start of the string.
if re.match(r"^1992", sen4):
    print("Match")
else:
    print("No Match")

# "$" anchors the pattern to the end of the string. re.match cannot be used
# here because it only looks at the beginning, so re.search is required.
if re.search(r"born$", sen4):
    print("Match")
else:
    print("No Match")
# Substitution
sen5 = "I love Avengers"

# By default re.sub replaces every non-overlapping match in the string.
print(re.sub(r"Avengers", "Justice League", sen5))

# The re.I (IGNORECASE) flag makes [a-z] match upper-case letters as well.
print(re.sub(r"[a-z]", "0", sen5, flags=re.I))

# count=1 limits the substitution to the first match only. It is passed by
# keyword because a positional fourth argument is easily mistaken for
# `flags`, and positional `count` is deprecated since Python 3.13.
print(re.sub(r"[a-z]", "0", sen5, count=1, flags=re.I))
# Shorthand character classes
sent1 = "welcome to the new year party 2021"
sent2 = "Just ~% +++--- arrived at @Jack's place. #fun"
sent3 = "I love you"

# \d: delete every digit.
print(re.sub(r"\d", "", sent1))
# [...]: delete any single character that is listed inside the brackets.
print(re.sub(r"[@%'~+\.#]", "", sent2))
# \w: replace every word character (letter, digit, underscore) with a space.
print(re.sub(r"\w", " ", sent2))
# \W: replace every NON-word character with a space, keeping only the words.
sent2_mod = re.sub(r"\W", " ", sent2)
print(sent2_mod)
# \s+: collapse each run of one or more whitespace characters into one space.
sent2_mod = re.sub(r"\s+", " ", sent2_mod)
# Drop single-letter words surrounded by whitespace (here the stray "s"
# left over from "Jack's").
sent2_mod = re.sub(r"\s+[a-zA-Z]\s+", " ", sent2_mod)
sent3_mod = re.sub(r"\s+", " ", sent3)

# NOTE on "-" inside a character class:
# "-" denotes a range of characters. For example, in [@#%$+-\.] the part
# "+-\." is read as the range from "+" to "." by character code (43-46),
# not as the three separate characters "+", "-" and ".".
# That particular mistake raises no error because 43-46 is a valid range,
# but writing something like "+-#" fails: the range would be 43-35, which
# is invalid.
# Whenever a literal "-" is meant, escape it as "\-".
# Preprocessing using regex
X = [
    "This is a wolf #scary",
    "Welcome to the jungle #missing",
    "11322 the number to know",
    "Remember the name s - John",
    "I love you",
]

# Clean every sentence in place, applying the steps in this exact order.
for i, sentence in enumerate(X):
    # Replace punctuation/symbols (non-word characters) with spaces.
    sentence = re.sub(r"\W", " ", sentence)
    # Replace digits with spaces.
    sentence = re.sub(r"\d", " ", sentence)
    # Remove single-letter words that have whitespace on both sides.
    sentence = re.sub(r"\s+[a-z]\s+", " ", sentence, flags=re.I)
    # Collapse runs of whitespace created by the steps above.
    sentence = re.sub(r"\s+", " ", sentence)
    # Trim the single leading / trailing space that may remain.
    sentence = re.sub(r"^\s", "", sentence)
    sentence = re.sub(r"\s$", "", sentence)
    X[i] = sentence